1 /* 2 * Copyright 2019 Google LLC 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "include/core/SkStream.h" 9 #include "include/core/SkString.h" 10 #include "include/private/SkChecksum.h" 11 #include "include/private/SkHalf.h" 12 #include "include/private/SkSpinlock.h" 13 #include "include/private/SkTFitsIn.h" 14 #include "include/private/SkThreadID.h" 15 #include "include/private/SkVx.h" 16 #include "src/core/SkColorSpaceXformSteps.h" 17 #include "src/core/SkCpu.h" 18 #include "src/core/SkEnumerate.h" 19 #include "src/core/SkOpts.h" 20 #include "src/core/SkVM.h" 21 #include <algorithm> 22 #include <atomic> 23 #include <queue> 24 25 #if defined(SKVM_LLVM) 26 #include <future> 27 #include <llvm/Bitcode/BitcodeWriter.h> 28 #include <llvm/ExecutionEngine/ExecutionEngine.h> 29 #include <llvm/IR/IRBuilder.h> 30 #include <llvm/IR/Verifier.h> 31 #include <llvm/Support/TargetSelect.h> 32 33 // Platform-specific intrinsics got their own files in LLVM 10. 34 #if __has_include(<llvm/IR/IntrinsicsX86.h>) 35 #include <llvm/IR/IntrinsicsX86.h> 36 #endif 37 #endif 38 39 bool gSkVMAllowJIT{false}; 40 bool gSkVMJITViaDylib{false}; 41 42 #if defined(SKVM_JIT) 43 #if defined(SK_BUILD_FOR_WIN) 44 #include "src/core/SkLeanWindows.h" 45 #include <memoryapi.h> 46 alloc_jit_buffer(size_t * len)47 static void* alloc_jit_buffer(size_t* len) { 48 return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); 49 } unmap_jit_buffer(void * ptr,size_t len)50 static void unmap_jit_buffer(void* ptr, size_t len) { 51 VirtualFree(ptr, 0, MEM_RELEASE); 52 } remap_as_executable(void * ptr,size_t len)53 static void remap_as_executable(void* ptr, size_t len) { 54 DWORD old; 55 VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old); 56 SkASSERT(old == PAGE_READWRITE); 57 } close_dylib(void * dylib)58 static void close_dylib(void* dylib) { 59 SkASSERT(false); // TODO? For now just assert we never make one. 60 } 61 #else 62 #include <dlfcn.h> 63 #include <sys/mman.h> 64 alloc_jit_buffer(size_t * len)65 static void* alloc_jit_buffer(size_t* len) { 66 // While mprotect and VirtualAlloc both work at page granularity, 67 // mprotect doesn't round up for you, and instead requires *len is at page granularity. 68 const size_t page = sysconf(_SC_PAGESIZE); 69 *len = ((*len + page - 1) / page) * page; 70 return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0); 71 } unmap_jit_buffer(void * ptr,size_t len)72 static void unmap_jit_buffer(void* ptr, size_t len) { 73 munmap(ptr, len); 74 } remap_as_executable(void * ptr,size_t len)75 static void remap_as_executable(void* ptr, size_t len) { 76 mprotect(ptr, len, PROT_READ|PROT_EXEC); 77 __builtin___clear_cache((char*)ptr, 78 (char*)ptr + len); 79 } close_dylib(void * dylib)80 static void close_dylib(void* dylib) { 81 dlclose(dylib); 82 } 83 #endif 84 85 #if defined(SKVM_JIT_VTUNE) 86 #include <jitprofiling.h> notify_vtune(const char * name,void * addr,size_t len)87 static void notify_vtune(const char* name, void* addr, size_t len) { 88 if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) { 89 iJIT_Method_Load event; 90 memset(&event, 0, sizeof(event)); 91 event.method_id = iJIT_GetNewMethodID(); 92 event.method_name = const_cast<char*>(name); 93 event.method_load_address = addr; 94 event.method_size = len; 95 iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event); 96 } 97 } 98 #else notify_vtune(const char * name,void * addr,size_t len)99 static void notify_vtune(const char* name, void* addr, size_t len) {} 100 #endif 101 #endif 102 103 // JIT code isn't MSAN-instrumented, so we won't see when it uses 104 // uninitialized memory, and we'll not see the writes it makes as properly 105 // initializing memory. Instead force the interpreter, which should let 106 // MSAN see everything our programs do properly. 107 // 108 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter. 109 #if defined(__has_feature) 110 #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer) 111 #define SKVM_JIT_BUT_IGNORE_IT 112 #endif 113 #endif 114 115 116 117 namespace skvm { 118 detect_features()119 static Features detect_features() { 120 static const bool fma = 121 #if defined(SK_CPU_X86) 122 SkCpu::Supports(SkCpu::HSW); 123 #elif defined(SK_CPU_ARM64) 124 true; 125 #else 126 false; 127 #endif 128 129 static const bool fp16 = false; // TODO 130 131 return { fma, fp16 }; 132 } 133 Builder()134 Builder::Builder() : fFeatures(detect_features()) {} Builder(Features features)135 Builder::Builder(Features features) : fFeatures(features ) {} 136 137 138 struct Program::Impl { 139 std::vector<InterpreterInstruction> instructions; 140 int regs = 0; 141 int loop = 0; 142 std::vector<int> strides; 143 144 std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders 145 size_t jit_size = 0; 146 void* dylib = nullptr; 147 148 #if defined(SKVM_LLVM) 149 std::unique_ptr<llvm::LLVMContext> llvm_ctx; 150 std::unique_ptr<llvm::ExecutionEngine> llvm_ee; 151 std::future<void> llvm_compiling; 152 #endif 153 }; 154 155 // Debugging tools, mostly for printing various data structures out to a stream. 156 157 namespace { 158 class SkDebugfStream final : public SkWStream { 159 size_t fBytesWritten = 0; 160 write(const void * buffer,size_t size)161 bool write(const void* buffer, size_t size) override { 162 SkDebugf("%.*s", size, buffer); 163 fBytesWritten += size; 164 return true; 165 } 166 bytesWritten() const167 size_t bytesWritten() const override { 168 return fBytesWritten; 169 } 170 }; 171 172 struct V { Val id; }; 173 struct R { Reg id; }; 174 struct Shift { int bits; }; 175 struct Splat { int bits; }; 176 struct Hex { int bits; }; 177 write(SkWStream * o,const char * s)178 static void write(SkWStream* o, const char* s) { 179 o->writeText(s); 180 } 181 name(Op op)182 static const char* name(Op op) { 183 switch (op) { 184 #define M(x) case Op::x: return #x; 185 SKVM_OPS(M) 186 #undef M 187 } 188 return "unknown op"; 189 } 190 write(SkWStream * o,Op op)191 static void write(SkWStream* o, Op op) { 192 o->writeText(name(op)); 193 } write(SkWStream * o,Ptr p)194 static void write(SkWStream* o, Ptr p) { 195 write(o, "ptr"); 196 o->writeDecAsText(p.ix); 197 } write(SkWStream * o,V v)198 static void write(SkWStream* o, V v) { 199 write(o, "v"); 200 o->writeDecAsText(v.id); 201 } write(SkWStream * o,R r)202 static void write(SkWStream* o, R r) { 203 write(o, "r"); 204 o->writeDecAsText(r.id); 205 } write(SkWStream * o,Shift s)206 static void write(SkWStream* o, Shift s) { 207 o->writeDecAsText(s.bits); 208 } write(SkWStream * o,Splat s)209 static void write(SkWStream* o, Splat s) { 210 float f; 211 memcpy(&f, &s.bits, 4); 212 o->writeHexAsText(s.bits); 213 write(o, " ("); 214 o->writeScalarAsText(f); 215 write(o, ")"); 216 } write(SkWStream * o,Hex h)217 static void write(SkWStream* o, Hex h) { 218 o->writeHexAsText(h.bits); 219 } 220 221 template <typename T, typename... Ts> write(SkWStream * o,T first,Ts...rest)222 static void write(SkWStream* o, T first, Ts... rest) { 223 write(o, first); 224 write(o, " "); 225 write(o, rest...); 226 } 227 } // namespace 228 write_one_instruction(Val id,const OptimizedInstruction & inst,SkWStream * o)229 static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) { 230 Op op = inst.op; 231 Val x = inst.x, 232 y = inst.y, 233 z = inst.z, 234 w = inst.w; 235 int immA = inst.immA, 236 immB = inst.immB; 237 switch (op) { 238 case Op::assert_true: write(o, op, V{x}, V{y}); break; 239 240 case Op::store8: write(o, op, Ptr{immA}, V{x} ); break; 241 case Op::store16: write(o, op, Ptr{immA}, V{x} ); break; 242 case Op::store32: write(o, op, Ptr{immA}, V{x} ); break; 243 case Op::store64: write(o, op, Ptr{immA}, V{x},V{y} ); break; 244 case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break; 245 246 case Op::index: write(o, V{id}, "=", op); break; 247 248 case Op::load8: write(o, V{id}, "=", op, Ptr{immA}); break; 249 case Op::load16: write(o, V{id}, "=", op, Ptr{immA}); break; 250 case Op::load32: write(o, V{id}, "=", op, Ptr{immA}); break; 251 case Op::load64: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 252 case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 253 254 case Op::gather8: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 255 case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 256 case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 257 258 case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 259 260 case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break; 261 262 case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 263 case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 264 case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 265 case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 266 case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 267 case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 268 case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 269 case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 270 case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 271 272 273 case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break; 274 275 case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 276 case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 277 case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 278 case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 279 280 281 case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 282 case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 283 case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 284 285 case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 286 case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 287 case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 288 289 case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 290 case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 291 292 293 case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y}); break; 294 case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y}); break; 295 case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y}); break; 296 case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break; 297 298 case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 299 300 case Op::ceil: write(o, V{id}, "=", op, V{x}); break; 301 case Op::floor: write(o, V{id}, "=", op, V{x}); break; 302 case Op::to_f32: write(o, V{id}, "=", op, V{x}); break; 303 case Op::to_fp16: write(o, V{id}, "=", op, V{x}); break; 304 case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break; 305 case Op::trunc: write(o, V{id}, "=", op, V{x}); break; 306 case Op::round: write(o, V{id}, "=", op, V{x}); break; 307 } 308 309 write(o, "\n"); 310 } 311 dump(SkWStream * o) const312 void Builder::dump(SkWStream* o) const { 313 SkDebugfStream debug; 314 if (!o) { o = &debug; } 315 316 std::vector<OptimizedInstruction> optimized = this->optimize(); 317 o->writeDecAsText(optimized.size()); 318 o->writeText(" values (originally "); 319 o->writeDecAsText(fProgram.size()); 320 o->writeText("):\n"); 321 for (Val id = 0; id < (Val)optimized.size(); id++) { 322 const OptimizedInstruction& inst = optimized[id]; 323 write(o, inst.can_hoist ? "↑ " : " "); 324 write_one_instruction(id, inst, o); 325 } 326 } 327 dump(SkWStream * o) const328 void Program::dump(SkWStream* o) const { 329 SkDebugfStream debug; 330 if (!o) { o = &debug; } 331 332 o->writeDecAsText(fImpl->regs); 333 o->writeText(" registers, "); 334 o->writeDecAsText(fImpl->instructions.size()); 335 o->writeText(" instructions:\n"); 336 for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) { 337 if (i == fImpl->loop) { write(o, "loop:\n"); } 338 o->writeDecAsText(i); 339 o->writeText("\t"); 340 if (i >= fImpl->loop) { write(o, " "); } 341 const InterpreterInstruction& inst = fImpl->instructions[i]; 342 Op op = inst.op; 343 Reg d = inst.d, 344 x = inst.x, 345 y = inst.y, 346 z = inst.z, 347 w = inst.w; 348 int immA = inst.immA, 349 immB = inst.immB; 350 switch (op) { 351 case Op::assert_true: write(o, op, R{x}, R{y}); break; 352 353 case Op::store8: write(o, op, Ptr{immA}, R{x} ); break; 354 case Op::store16: write(o, op, Ptr{immA}, R{x} ); break; 355 case Op::store32: write(o, op, Ptr{immA}, R{x} ); break; 356 case Op::store64: write(o, op, Ptr{immA}, R{x}, R{y} ); break; 357 case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break; 358 359 case Op::index: write(o, R{d}, "=", op); break; 360 361 case Op::load8: write(o, R{d}, "=", op, Ptr{immA}); break; 362 case Op::load16: write(o, R{d}, "=", op, Ptr{immA}); break; 363 case Op::load32: write(o, R{d}, "=", op, Ptr{immA}); break; 364 case Op::load64: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 365 case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 366 367 case Op::gather8: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 368 case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 369 case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 370 371 case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 372 373 case Op::splat: write(o, R{d}, "=", op, Splat{immA}); break; 374 375 case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 376 case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 377 case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 378 case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 379 case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 380 case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 381 case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 382 case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 383 case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 384 385 case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break; 386 387 case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 388 case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 389 case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 390 case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 391 392 393 case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 394 case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 395 case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 396 397 case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 398 case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 399 case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 400 401 case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 402 case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 403 404 case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y}); break; 405 case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y}); break; 406 case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y}); break; 407 case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break; 408 409 case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 410 411 case Op::ceil: write(o, R{d}, "=", op, R{x}); break; 412 case Op::floor: write(o, R{d}, "=", op, R{x}); break; 413 case Op::to_f32: write(o, R{d}, "=", op, R{x}); break; 414 case Op::to_fp16: write(o, R{d}, "=", op, R{x}); break; 415 case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break; 416 case Op::trunc: write(o, R{d}, "=", op, R{x}); break; 417 case Op::round: write(o, R{d}, "=", op, R{x}); break; 418 } 419 write(o, "\n"); 420 } 421 } 422 eliminate_dead_code(std::vector<Instruction> program)423 std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) { 424 // Determine which Instructions are live by working back from side effects. 425 std::vector<bool> live(program.size(), false); 426 auto mark_live = [&](Val id, auto& recurse) -> void { 427 if (live[id] == false) { 428 live[id] = true; 429 Instruction inst = program[id]; 430 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 431 if (arg != NA) { recurse(arg, recurse); } 432 } 433 } 434 }; 435 for (Val id = 0; id < (Val)program.size(); id++) { 436 if (has_side_effect(program[id].op)) { 437 mark_live(id, mark_live); 438 } 439 } 440 441 // Rewrite the program with only live Instructions: 442 // - remap IDs in live Instructions to what they'll be once dead Instructions are removed; 443 // - then actually remove the dead Instructions. 444 std::vector<Val> new_id(program.size(), NA); 445 for (Val id = 0, next = 0; id < (Val)program.size(); id++) { 446 if (live[id]) { 447 Instruction& inst = program[id]; 448 for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) { 449 if (*arg != NA) { 450 *arg = new_id[*arg]; 451 SkASSERT(*arg != NA); 452 } 453 } 454 new_id[id] = next++; 455 } 456 } 457 auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) { 458 Val id = (Val)(&inst - program.data()); 459 return !live[id]; 460 }); 461 program.erase(it, program.end()); 462 463 return program; 464 } 465 finalize(const std::vector<Instruction> program)466 std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) { 467 std::vector<OptimizedInstruction> optimized(program.size()); 468 for (Val id = 0; id < (Val)program.size(); id++) { 469 Instruction inst = program[id]; 470 optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w, inst.immA,inst.immB, 471 /*death=*/id, /*can_hoist=*/true}; 472 } 473 474 // Each Instruction's inputs need to live at least until that Instruction issues. 475 for (Val id = 0; id < (Val)optimized.size(); id++) { 476 OptimizedInstruction& inst = optimized[id]; 477 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 478 // (We're walking in order, so this is the same as max()ing with the existing Val.) 479 if (arg != NA) { optimized[arg].death = id; } 480 } 481 } 482 483 // Mark which values don't depend on the loop and can be hoisted. 484 for (OptimizedInstruction& inst : optimized) { 485 // Varying loads (and gathers) and stores cannot be hoisted out of the loop. 486 if (is_always_varying(inst.op)) { 487 inst.can_hoist = false; 488 } 489 490 // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself. 491 if (inst.can_hoist) { 492 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 493 if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; } 494 } 495 } 496 } 497 498 // Extend the lifetime of any hoisted value that's used in the loop to infinity. 499 for (OptimizedInstruction& inst : optimized) { 500 if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) { 501 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 502 if (arg != NA && optimized[arg].can_hoist) { 503 optimized[arg].death = (Val)program.size(); 504 } 505 } 506 } 507 } 508 509 return optimized; 510 } 511 optimize() const512 std::vector<OptimizedInstruction> Builder::optimize() const { 513 std::vector<Instruction> program = this->program(); 514 program = eliminate_dead_code(std::move(program)); 515 return finalize (std::move(program)); 516 } 517 done(const char * debug_name,bool allow_jit) const518 Program Builder::done(const char* debug_name, bool allow_jit) const { 519 char buf[64] = "skvm-jit-"; 520 if (!debug_name) { 521 *SkStrAppendU32(buf+9, this->hash()) = '\0'; 522 debug_name = buf; 523 } 524 525 return {this->optimize(), fStrides, debug_name, allow_jit}; 526 } 527 hash() const528 uint64_t Builder::hash() const { 529 uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0), 530 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1); 531 return (uint64_t)lo | (uint64_t)hi << 32; 532 } 533 operator ==(const Instruction & a,const Instruction & b)534 bool operator==(const Instruction& a, const Instruction& b) { 535 return a.op == b.op 536 && a.x == b.x 537 && a.y == b.y 538 && a.z == b.z 539 && a.w == b.w 540 && a.immA == b.immA 541 && a.immB == b.immB; 542 } 543 operator ()(const Instruction & inst,uint32_t seed) const544 uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const { 545 return SkOpts::hash(&inst, sizeof(inst), seed); 546 } 547 548 549 // Most instructions produce a value and return it by ID, 550 // the value-producing instruction's own index in the program vector. push(Instruction inst)551 Val Builder::push(Instruction inst) { 552 // Basic common subexpression elimination: 553 // if we've already seen this exact Instruction, use it instead of creating a new one. 554 // 555 // But we never dedup loads or stores: an intervening store could change that memory. 556 // Uniforms and gathers touch only uniform memory, so they're fine to dedup, 557 // and index is varying but doesn't touch memory, so it's fine to dedup too. 558 if (!touches_varying_memory(inst.op)) { 559 if (Val* id = fIndex.find(inst)) { 560 return *id; 561 } 562 } 563 Val id = static_cast<Val>(fProgram.size()); 564 fProgram.push_back(inst); 565 fIndex.set(inst, id); 566 return id; 567 } 568 arg(int stride)569 Ptr Builder::arg(int stride) { 570 int ix = (int)fStrides.size(); 571 fStrides.push_back(stride); 572 return {ix}; 573 } 574 assert_true(I32 cond,I32 debug)575 void Builder::assert_true(I32 cond, I32 debug) { 576 #ifdef SK_DEBUG 577 int imm; 578 if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; } 579 (void)push(Op::assert_true, cond.id, debug.id); 580 #endif 581 } 582 store8(Ptr ptr,I32 val)583 void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); } store16(Ptr ptr,I32 val)584 void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); } store32(Ptr ptr,I32 val)585 void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); } store64(Ptr ptr,I32 lo,I32 hi)586 void Builder::store64(Ptr ptr, I32 lo, I32 hi) { 587 (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix); 588 } store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)589 void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) { 590 (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix); 591 } 592 index()593 I32 Builder::index() { return {this, push(Op::index)}; } 594 load8(Ptr ptr)595 I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; } load16(Ptr ptr)596 I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; } load32(Ptr ptr)597 I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; } load64(Ptr ptr,int lane)598 I32 Builder::load64(Ptr ptr, int lane) { 599 return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) }; 600 } load128(Ptr ptr,int lane)601 I32 Builder::load128(Ptr ptr, int lane) { 602 return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) }; 603 } 604 gather8(Ptr ptr,int offset,I32 index)605 I32 Builder::gather8 (Ptr ptr, int offset, I32 index) { 606 return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)}; 607 } gather16(Ptr ptr,int offset,I32 index)608 I32 Builder::gather16(Ptr ptr, int offset, I32 index) { 609 return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)}; 610 } gather32(Ptr ptr,int offset,I32 index)611 I32 Builder::gather32(Ptr ptr, int offset, I32 index) { 612 return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)}; 613 } 614 uniform32(Ptr ptr,int offset)615 I32 Builder::uniform32(Ptr ptr, int offset) { 616 return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)}; 617 } 618 splat(int n)619 I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; } 620 621 // Be careful peepholing float math! Transformations you might expect to 622 // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0. 623 // Float peepholes must pass this equivalence test for all ~4B floats: 624 // 625 // bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); } 626 // 627 // unsigned bits = 0; 628 // do { 629 // float f; 630 // memcpy(&f, &bits, 4); 631 // if (!equiv(f, ...)) { 632 // abort(); 633 // } 634 // } while (++bits != 0); 635 add(F32 x,F32 y)636 F32 Builder::add(F32 x, F32 y) { 637 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } 638 if (this->isImm(y.id, 0.0f)) { return x; } // x+0 == x 639 if (this->isImm(x.id, 0.0f)) { return y; } // 0+y == y 640 641 if (fFeatures.fma) { 642 if (fProgram[x.id].op == Op::mul_f32) { 643 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 644 } 645 if (fProgram[y.id].op == Op::mul_f32) { 646 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 647 } 648 } 649 return {this, this->push(Op::add_f32, x.id, y.id)}; 650 } 651 sub(F32 x,F32 y)652 F32 Builder::sub(F32 x, F32 y) { 653 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } 654 if (this->isImm(y.id, 0.0f)) { return x; } // x-0 == x 655 if (fFeatures.fma) { 656 if (fProgram[x.id].op == Op::mul_f32) { 657 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 658 } 659 if (fProgram[y.id].op == Op::mul_f32) { 660 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 661 } 662 } 663 return {this, this->push(Op::sub_f32, x.id, y.id)}; 664 } 665 mul(F32 x,F32 y)666 F32 Builder::mul(F32 x, F32 y) { 667 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } 668 if (this->isImm(y.id, 1.0f)) { return x; } // x*1 == x 669 if (this->isImm(x.id, 1.0f)) { return y; } // 1*y == y 670 return {this, this->push(Op::mul_f32, x.id, y.id)}; 671 } 672 fast_mul(F32 x,F32 y)673 F32 Builder::fast_mul(F32 x, F32 y) { 674 if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); } 675 return mul(x,y); 676 } 677 div(F32 x,F32 y)678 F32 Builder::div(F32 x, F32 y) { 679 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); } 680 if (this->isImm(y.id, 1.0f)) { return x; } // x/1 == x 681 return {this, this->push(Op::div_f32, x.id, y.id)}; 682 } 683 sqrt(F32 x)684 F32 Builder::sqrt(F32 x) { 685 if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); } 686 return {this, this->push(Op::sqrt_f32, x.id)}; 687 } 688 689 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. approx_log2(F32 x)690 F32 Builder::approx_log2(F32 x) { 691 // e - 127 is a fair approximation of log2(x) in its own right... 692 F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23))); 693 694 // ... but using the mantissa to refine its error is _much_ better. 695 F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff), 696 0x3f000000)); 697 F32 approx = sub(e, 124.225514990f); 698 approx = sub(approx, mul(1.498030302f, m)); 699 approx = sub(approx, div(1.725879990f, add(0.3520887068f, m))); 700 701 return approx; 702 } 703 approx_pow2(F32 x)704 F32 Builder::approx_pow2(F32 x) { 705 F32 f = fract(x); 706 F32 approx = add(x, 121.274057500f); 707 approx = sub(approx, mul( 1.490129070f, f)); 708 approx = add(approx, div(27.728023300f, sub(4.84252568f, f))); 709 710 return pun_to_F32(round(mul(1.0f * (1<<23), approx))); 711 } 712 approx_powf(F32 x,F32 y)713 F32 Builder::approx_powf(F32 x, F32 y) { 714 // TODO: assert this instead? Sometimes x is very slightly negative. See skia:10210. 715 x = max(0.0f, x); 716 717 auto is_x = bit_or(eq(x, 0.0f), 718 eq(x, 1.0f)); 719 return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); 720 } 721 722 // Bhaskara I's sine approximation 723 // 16x(pi - x) / (5*pi^2 - 4x(pi - x) 724 // ... divide by 4 725 // 4x(pi - x) / 5*pi^2/4 - x(pi - x) 726 // 727 // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get 728 // radians into that range first. 729 // approx_sin(F32 radians)730 F32 Builder::approx_sin(F32 radians) { 731 constexpr float Pi = SK_ScalarPI; 732 // x = radians mod 2pi 733 F32 x = fract(radians * (0.5f/Pi)) * (2*Pi); 734 I32 neg = x > Pi; // are we pi < x < 2pi --> need to negate result 735 x = select(neg, x - Pi, x); 736 737 F32 pair = x * (Pi - x); 738 x = 4.0f * pair / ((5*Pi*Pi/4) - pair); 739 x = select(neg, -x, x); 740 return x; 741 } 742 743 /* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION" 744 https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf 745 746 approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9 747 748 Some simplifications: 749 1. tan(x) is periodic, -PI/2 < x < PI/2 750 2. tan(x) is odd, so tan(-x) = -tan(x) 751 3. Our polynomial approximation is best near zero, so we use the following identity 752 tan(x) + tan(y) 753 tan(x + y) = ----------------- 754 1 - tan(x)*tan(y) 755 tan(PI/4) = 1 756 757 So for x > PI/8, we do the following refactor: 758 x' = x - PI/4 759 760 1 + tan(x') 761 tan(x) = ------------ 762 1 - tan(x') 763 */ approx_tan(F32 x)764 F32 Builder::approx_tan(F32 x) { 765 constexpr float Pi = SK_ScalarPI; 766 // periodic between -pi/2 ... pi/2 767 // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back 768 x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2); 769 770 I32 neg = (x < 0.0f); 771 x = select(neg, -x, x); 772 773 // minimize total error by shifting if x > pi/8 774 I32 use_quotient = (x > (Pi/8)); 775 x = select(use_quotient, x - (Pi/4), x); 776 777 // 9th order poly = 4th order(x^2) * x 778 x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x; 779 x = select(use_quotient, (1+x)/(1-x), x); 780 x = select(neg, -x, x); 781 return x; 782 } 783 784 // http://mathforum.org/library/drmath/view/54137.html 785 // referencing Handbook of Mathematical Functions, 786 // by Milton Abramowitz and Irene Stegun approx_asin(F32 x)787 F32 Builder::approx_asin(F32 x) { 788 I32 neg = (x < 0.0f); 789 x = select(neg, -x, x); 790 x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f); 791 x = select(neg, -x, x); 792 return x; 793 } 794 795 /* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/ 796 * with 129 values of x,atan(x) for x:[0...1] 797 * This only works for 0 <= x <= 1 798 */ approx_atan_unit(F32 x)799 static F32 approx_atan_unit(F32 x) { 800 // for now we might be given NaN... let that through 801 x->assert_true((x != x) | ((x >= 0) & (x <= 1))); 802 return poly(x, 0.14130025741326729f, 803 -0.34312835980675116f, 804 -0.016172900528248768f, 805 1.0037696976200385f, 806 -0.00014758242182738969f); 807 } 808 809 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 810 */ approx_atan(F32 x)811 F32 Builder::approx_atan(F32 x) { 812 I32 neg = (x < 0.0f); 813 x = select(neg, -x, x); 814 I32 flip = (x > 1.0f); 815 x = select(flip, 1/x, x); 816 x = approx_atan_unit(x); 817 x = select(flip, SK_ScalarPI/2 - x, x); 818 x = select(neg, -x, x); 819 return x; 820 } 821 822 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 823 * By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit() 824 * which avoids a 2nd divide instruction if we had instead called atan(). 825 */ approx_atan2(F32 y0,F32 x0)826 F32 Builder::approx_atan2(F32 y0, F32 x0) { 827 828 I32 flip = (abs(y0) > abs(x0)); 829 F32 y = select(flip, x0, y0); 830 F32 x = select(flip, y0, x0); 831 F32 arg = y/x; 832 833 I32 neg = (arg < 0.0f); 834 arg = select(neg, -arg, arg); 835 836 F32 r = approx_atan_unit(arg); 837 r = select(flip, SK_ScalarPI/2 - r, r); 838 r = select(neg, -r, r); 839 840 // handle quadrant distinctions 841 r = select((y0 >= 0) & (x0 < 0), r + SK_ScalarPI, r); 842 r = select((y0 < 0) & (x0 <= 0), r - SK_ScalarPI, r); 843 // Note: we don't try to handle 0,0 or infinities (yet) 844 return r; 845 } 846 min(F32 x,F32 y)847 F32 Builder::min(F32 x, F32 y) { 848 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); } 849 return {this, this->push(Op::min_f32, x.id, y.id)}; 850 } max(F32 x,F32 y)851 F32 Builder::max(F32 x, F32 y) { 852 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); } 853 return {this, this->push(Op::max_f32, x.id, y.id)}; 854 } 855 add(I32 x,I32 y)856 I32 Builder::add(I32 x, I32 y) { 857 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } 858 if (this->isImm(x.id, 0)) { return y; } 859 if (this->isImm(y.id, 0)) { return x; } 860 return {this, this->push(Op::add_i32, x.id, y.id)}; 861 } sub(I32 x,I32 y)862 I32 Builder::sub(I32 x, I32 y) { 863 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } 864 if (this->isImm(y.id, 0)) { return x; } 865 return {this, this->push(Op::sub_i32, x.id, y.id)}; 866 } mul(I32 x,I32 y)867 I32 Builder::mul(I32 x, I32 y) { 868 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } 869 if (this->isImm(x.id, 0)) { return splat(0); } 870 if (this->isImm(y.id, 0)) { return splat(0); } 871 if (this->isImm(x.id, 1)) { return y; } 872 if (this->isImm(y.id, 1)) { return x; } 873 return {this, this->push(Op::mul_i32, x.id, y.id)}; 874 } 875 shl(I32 x,int bits)876 I32 Builder::shl(I32 x, int bits) { 877 if (bits == 0) { return x; } 878 if (int X; this->allImm(x.id,&X)) { return splat(X << bits); } 879 return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)}; 880 } shr(I32 x,int bits)881 I32 Builder::shr(I32 x, int bits) { 882 if (bits == 0) { return x; } 883 if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); } 884 return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)}; 885 } sra(I32 x,int bits)886 I32 Builder::sra(I32 x, int bits) { 887 if (bits == 0) { return x; } 888 if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); } 889 return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)}; 890 } 891 eq(F32 x,F32 y)892 I32 Builder:: eq(F32 x, F32 y) { 893 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } 894 return {this, this->push(Op::eq_f32, x.id, y.id)}; 895 } neq(F32 x,F32 y)896 I32 Builder::neq(F32 x, F32 y) { 897 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } 898 return {this, this->push(Op::neq_f32, x.id, y.id)}; 899 } lt(F32 x,F32 y)900 I32 Builder::lt(F32 x, F32 y) { 901 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); } 902 return {this, this->push(Op::gt_f32, y.id, x.id)}; 903 } lte(F32 x,F32 y)904 I32 Builder::lte(F32 x, F32 y) { 905 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); } 906 return {this, this->push(Op::gte_f32, y.id, x.id)}; 907 } gt(F32 x,F32 y)908 I32 Builder::gt(F32 x, F32 y) { 909 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } 910 return {this, this->push(Op::gt_f32, x.id, y.id)}; 911 } gte(F32 x,F32 y)912 I32 Builder::gte(F32 x, F32 y) { 913 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } 914 return {this, this->push(Op::gte_f32, x.id, y.id)}; 915 } 916 eq(I32 x,I32 y)917 I32 Builder:: eq(I32 x, I32 y) { 918 if (x.id == y.id) { return splat(~0); } 919 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } 920 return {this, this->push(Op:: eq_i32, x.id, y.id)}; 921 } neq(I32 x,I32 y)922 I32 Builder::neq(I32 x, I32 y) { 923 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } 924 return ~(x == y); 925 } gt(I32 x,I32 y)926 I32 Builder:: gt(I32 x, I32 y) { 927 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } 928 return {this, this->push(Op:: gt_i32, x.id, y.id)}; 929 } gte(I32 x,I32 y)930 I32 Builder::gte(I32 x, I32 y) { 931 if (x.id == y.id) { return splat(~0); } 932 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } 933 return ~(x < y); 934 } lt(I32 x,I32 y)935 I32 Builder:: lt(I32 x, I32 y) { return y>x; } lte(I32 x,I32 y)936 I32 Builder::lte(I32 x, I32 y) { return y>=x; } 937 bit_and(I32 x,I32 y)938 I32 Builder::bit_and(I32 x, I32 y) { 939 if (x.id == y.id) { return x; } 940 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); } 941 if (this->isImm(y.id, 0)) { return splat(0); } // (x & false) == false 942 if (this->isImm(x.id, 0)) { return splat(0); } // (false & y) == false 943 if (this->isImm(y.id,~0)) { return x; } // (x & true) == x 944 if (this->isImm(x.id,~0)) { return y; } // (true & y) == y 945 return {this, this->push(Op::bit_and, x.id, y.id)}; 946 } bit_or(I32 x,I32 y)947 I32 Builder::bit_or(I32 x, I32 y) { 948 if (x.id == y.id) { return x; } 949 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); } 950 if (this->isImm(y.id, 0)) { return x; } // (x | false) == x 951 if (this->isImm(x.id, 0)) { return y; } // (false | y) == y 952 if (this->isImm(y.id,~0)) { return splat(~0); } // (x | true) == true 953 if (this->isImm(x.id,~0)) { return splat(~0); } // (true | y) == true 954 return {this, this->push(Op::bit_or, x.id, y.id)}; 955 } bit_xor(I32 x,I32 y)956 I32 Builder::bit_xor(I32 x, I32 y) { 957 if (x.id == y.id) { return splat(0); } 958 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); } 959 if (this->isImm(y.id, 0)) { return x; } // (x ^ false) == x 960 if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y 961 return {this, this->push(Op::bit_xor, x.id, y.id)}; 962 } 963 bit_clear(I32 x,I32 y)964 I32 Builder::bit_clear(I32 x, I32 y) { 965 if (x.id == y.id) { return splat(0); } 966 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); } 967 if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x 968 if (this->isImm(y.id,~0)) { return splat(0); } // (x & ~true) == false 969 if (this->isImm(x.id, 0)) { return splat(0); } // (false & ~y) == false 970 return {this, this->push(Op::bit_clear, x.id, y.id)}; 971 } 972 select(I32 x,I32 y,I32 z)973 I32 Builder::select(I32 x, I32 y, I32 z) { 974 if (y.id == z.id) { return y; } 975 if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); } 976 if (this->isImm(x.id,~0)) { return y; } // true ? y : z == y 977 if (this->isImm(x.id, 0)) { return z; } // false ? y : z == z 978 if (this->isImm(y.id, 0)) { return bit_clear(z,x); } // x ? 0 : z == ~x&z 979 if (this->isImm(z.id, 0)) { return bit_and (y,x); } // x ? y : 0 == x&y 980 return {this, this->push(Op::select, x.id, y.id, z.id)}; 981 } 982 extract(I32 x,int bits,I32 z)983 I32 Builder::extract(I32 x, int bits, I32 z) { 984 if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); } 985 return bit_and(z, shr(x, bits)); 986 } 987 pack(I32 x,I32 y,int bits)988 I32 Builder::pack(I32 x, I32 y, int bits) { 989 return bit_or(x, shl(y, bits)); 990 } 991 ceil(F32 x)992 F32 Builder::ceil(F32 x) { 993 if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); } 994 return {this, this->push(Op::ceil, x.id)}; 995 } floor(F32 x)996 F32 Builder::floor(F32 x) { 997 if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); } 998 return {this, this->push(Op::floor, x.id)}; 999 } to_F32(I32 x)1000 F32 Builder::to_F32(I32 x) { 1001 if (int X; this->allImm(x.id,&X)) { return splat((float)X); } 1002 return {this, this->push(Op::to_f32, x.id)}; 1003 } trunc(F32 x)1004 I32 Builder::trunc(F32 x) { 1005 if (float X; this->allImm(x.id,&X)) { return splat((int)X); } 1006 return {this, this->push(Op::trunc, x.id)}; 1007 } round(F32 x)1008 I32 Builder::round(F32 x) { 1009 if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); } 1010 return {this, this->push(Op::round, x.id)}; 1011 } 1012 to_fp16(F32 x)1013 I32 Builder::to_fp16(F32 x) { 1014 if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); } 1015 return {this, this->push(Op::to_fp16, x.id)}; 1016 } from_fp16(I32 x)1017 F32 Builder::from_fp16(I32 x) { 1018 if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); } 1019 return {this, this->push(Op::from_fp16, x.id)}; 1020 } 1021 from_unorm(int bits,I32 x)1022 F32 Builder::from_unorm(int bits, I32 x) { 1023 F32 limit = splat(1 / ((1<<bits)-1.0f)); 1024 return mul(to_F32(x), limit); 1025 } to_unorm(int bits,F32 x)1026 I32 Builder::to_unorm(int bits, F32 x) { 1027 F32 limit = splat((1<<bits)-1.0f); 1028 return round(mul(x, limit)); 1029 } 1030 SkColorType_to_PixelFormat(SkColorType ct)1031 PixelFormat SkColorType_to_PixelFormat(SkColorType ct) { 1032 auto UNORM = PixelFormat::UNORM, 1033 FLOAT = PixelFormat::FLOAT; 1034 switch (ct) { 1035 case kUnknown_SkColorType: break; 1036 1037 case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96}; 1038 1039 case kRGBA_F16Norm_SkColorType: return {FLOAT,16,16,16,16, 0,16,32,48}; 1040 case kRGBA_F16_SkColorType: return {FLOAT,16,16,16,16, 0,16,32,48}; 1041 case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48}; 1042 1043 case kA16_float_SkColorType: return {FLOAT, 0, 0,0,16, 0, 0,0,0}; 1044 case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0}; 1045 1046 case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0}; 1047 case kGray_8_SkColorType: return {UNORM, 8,8,8,0, 0,0,0,0}; // Subtle. 1048 1049 case kRGB_565_SkColorType: return {UNORM, 5,6,5,0, 11,5,0,0}; // (BGR) 1050 case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0}; // (ABGR) 1051 1052 case kRGBA_8888_SkColorType: return {UNORM, 8,8,8,8, 0,8,16,24}; 1053 case kRGB_888x_SkColorType: return {UNORM, 8,8,8,0, 0,8,16,32}; // 32-bit 1054 case kBGRA_8888_SkColorType: return {UNORM, 8,8,8,8, 16,8, 0,24}; 1055 1056 case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2, 0,10,20,30}; 1057 case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30}; 1058 case kRGB_101010x_SkColorType: return {UNORM, 10,10,10,0, 0,10,20, 0}; 1059 case kBGR_101010x_SkColorType: return {UNORM, 10,10,10,0, 20,10, 0, 0}; 1060 1061 case kR8G8_unorm_SkColorType: return {UNORM, 8, 8,0, 0, 0, 8,0,0}; 1062 case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0}; 1063 case kA16_unorm_SkColorType: return {UNORM, 0, 0,0,16, 0, 0,0,0}; 1064 } 1065 SkASSERT(false); 1066 return {UNORM, 0,0,0,0, 0,0,0,0}; 1067 } 1068 byte_size(PixelFormat f)1069 static int byte_size(PixelFormat f) { 1070 // What's the highest bit we read? 1071 int bits = std::max(f.r_bits + f.r_shift, 1072 std::max(f.g_bits + f.g_shift, 1073 std::max(f.b_bits + f.b_shift, 1074 f.a_bits + f.a_shift))); 1075 // Round up to bytes. 1076 return (bits + 7) / 8; 1077 } 1078 unpack(PixelFormat f,I32 x)1079 static Color unpack(PixelFormat f, I32 x) { 1080 SkASSERT(byte_size(f) <= 4); 1081 auto unpack_channel = [=](int bits, int shift) { 1082 I32 channel = extract(x, shift, (1<<bits)-1); 1083 switch (f.encoding) { 1084 case PixelFormat::UNORM: return from_unorm(bits, channel); 1085 case PixelFormat::FLOAT: return from_fp16 ( channel); 1086 } 1087 SkUNREACHABLE; 1088 }; 1089 return { 1090 f.r_bits ? unpack_channel(f.r_bits, f.r_shift) : x->splat(0.0f), 1091 f.g_bits ? unpack_channel(f.g_bits, f.g_shift) : x->splat(0.0f), 1092 f.b_bits ? unpack_channel(f.b_bits, f.b_shift) : x->splat(0.0f), 1093 f.a_bits ? unpack_channel(f.a_bits, f.a_shift) : x->splat(1.0f), 1094 }; 1095 } 1096 split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1097 static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) { 1098 SkASSERT(byte_size(f) == 8); 1099 // We assume some of the channels are in the low 32 bits, some in the high 32 bits. 1100 // The assert on byte_size(lo) will trigger if this assumption is violated. 1101 *lo = f; 1102 if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; } 1103 if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; } 1104 if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; } 1105 if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; } 1106 SkASSERT(byte_size(*lo) == 4); 1107 1108 *hi = f; 1109 if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; } 1110 if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; } 1111 if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; } 1112 if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; } 1113 SkASSERT(byte_size(*hi) == 4); 1114 } 1115 1116 // The only 16-byte format we support today is RGBA F32, 1117 // though, TODO, we could generalize that to any swizzle, and to allow UNORM too. assert_16byte_is_rgba_f32(PixelFormat f)1118 static void assert_16byte_is_rgba_f32(PixelFormat f) { 1119 #if defined(SK_DEBUG) 1120 SkASSERT(byte_size(f) == 16); 1121 PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType); 1122 1123 SkASSERT(f.encoding == rgba_f32.encoding); 1124 1125 SkASSERT(f.r_bits == rgba_f32.r_bits); 1126 SkASSERT(f.g_bits == rgba_f32.g_bits); 1127 SkASSERT(f.b_bits == rgba_f32.b_bits); 1128 SkASSERT(f.a_bits == rgba_f32.a_bits); 1129 1130 SkASSERT(f.r_shift == rgba_f32.r_shift); 1131 SkASSERT(f.g_shift == rgba_f32.g_shift); 1132 SkASSERT(f.b_shift == rgba_f32.b_shift); 1133 SkASSERT(f.a_shift == rgba_f32.a_shift); 1134 #endif 1135 } 1136 load(PixelFormat f,Ptr ptr)1137 Color Builder::load(PixelFormat f, Ptr ptr) { 1138 switch (byte_size(f)) { 1139 case 1: return unpack(f, load8 (ptr)); 1140 case 2: return unpack(f, load16(ptr)); 1141 case 4: return unpack(f, load32(ptr)); 1142 case 8: { 1143 PixelFormat lo,hi; 1144 split_disjoint_8byte_format(f, &lo,&hi); 1145 Color l = unpack(lo, load64(ptr, 0)), 1146 h = unpack(hi, load64(ptr, 1)); 1147 return { 1148 lo.r_bits ? l.r : h.r, 1149 lo.g_bits ? l.g : h.g, 1150 lo.b_bits ? l.b : h.b, 1151 lo.a_bits ? l.a : h.a, 1152 }; 1153 } 1154 case 16: { 1155 assert_16byte_is_rgba_f32(f); 1156 return { 1157 pun_to_F32(load128(ptr, 0)), 1158 pun_to_F32(load128(ptr, 1)), 1159 pun_to_F32(load128(ptr, 2)), 1160 pun_to_F32(load128(ptr, 3)), 1161 }; 1162 } 1163 default: SkUNREACHABLE; 1164 } 1165 return {}; 1166 } 1167 gather(PixelFormat f,Ptr ptr,int offset,I32 index)1168 Color Builder::gather(PixelFormat f, Ptr ptr, int offset, I32 index) { 1169 switch (byte_size(f)) { 1170 case 1: return unpack(f, gather8 (ptr, offset, index)); 1171 case 2: return unpack(f, gather16(ptr, offset, index)); 1172 case 4: return unpack(f, gather32(ptr, offset, index)); 1173 case 8: { 1174 PixelFormat lo,hi; 1175 split_disjoint_8byte_format(f, &lo,&hi); 1176 Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)), 1177 h = unpack(hi, gather32(ptr, offset, (index<<1)+1)); 1178 return { 1179 lo.r_bits ? l.r : h.r, 1180 lo.g_bits ? l.g : h.g, 1181 lo.b_bits ? l.b : h.b, 1182 lo.a_bits ? l.a : h.a, 1183 }; 1184 } 1185 case 16: { 1186 assert_16byte_is_rgba_f32(f); 1187 return { 1188 gatherF(ptr, offset, (index<<2)+0), 1189 gatherF(ptr, offset, (index<<2)+1), 1190 gatherF(ptr, offset, (index<<2)+2), 1191 gatherF(ptr, offset, (index<<2)+3), 1192 }; 1193 } 1194 default: SkUNREACHABLE; 1195 } 1196 return {}; 1197 } 1198 pack32(PixelFormat f,Color c)1199 static I32 pack32(PixelFormat f, Color c) { 1200 SkASSERT(byte_size(f) <= 4); 1201 I32 packed = c->splat(0); 1202 auto pack_channel = [&](F32 channel, int bits, int shift) { 1203 I32 encoded; 1204 switch (f.encoding) { 1205 case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break; 1206 case PixelFormat::FLOAT: encoded = to_fp16 ( channel); break; 1207 } 1208 packed = pack(packed, encoded, shift); 1209 }; 1210 if (f.r_bits) { pack_channel(c.r, f.r_bits, f.r_shift); } 1211 if (f.g_bits) { pack_channel(c.g, f.g_bits, f.g_shift); } 1212 if (f.b_bits) { pack_channel(c.b, f.b_bits, f.b_shift); } 1213 if (f.a_bits) { pack_channel(c.a, f.a_bits, f.a_shift); } 1214 return packed; 1215 } 1216 store(PixelFormat f,Ptr ptr,Color c)1217 void Builder::store(PixelFormat f, Ptr ptr, Color c) { 1218 // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal. 1219 if (f.r_bits == f.g_bits && f.g_bits == f.b_bits && 1220 f.r_shift == f.g_shift && f.g_shift == f.b_shift) { 1221 1222 // TODO: pull these coefficients from an SkColorSpace? This is sRGB luma/luminance. 1223 c.r = c.r * 0.2126f 1224 + c.g * 0.7152f 1225 + c.b * 0.0722f; 1226 f.g_bits = f.b_bits = 0; 1227 } 1228 1229 switch (byte_size(f)) { 1230 case 1: store8 (ptr, pack32(f,c)); break; 1231 case 2: store16(ptr, pack32(f,c)); break; 1232 case 4: store32(ptr, pack32(f,c)); break; 1233 case 8: { 1234 PixelFormat lo,hi; 1235 split_disjoint_8byte_format(f, &lo,&hi); 1236 store64(ptr, pack32(lo,c) 1237 , pack32(hi,c)); 1238 break; 1239 } 1240 case 16: { 1241 assert_16byte_is_rgba_f32(f); 1242 store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a)); 1243 break; 1244 } 1245 default: SkUNREACHABLE; 1246 } 1247 } 1248 unpremul(F32 * r,F32 * g,F32 * b,F32 a)1249 void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) { 1250 skvm::F32 invA = 1.0f / a, 1251 inf = pun_to_F32(splat(0x7f800000)); 1252 // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0). 1253 invA = select(invA < inf, invA 1254 , 0.0f); 1255 *r *= invA; 1256 *g *= invA; 1257 *b *= invA; 1258 } 1259 premul(F32 * r,F32 * g,F32 * b,F32 a)1260 void Builder::premul(F32* r, F32* g, F32* b, F32 a) { 1261 *r *= a; 1262 *g *= a; 1263 *b *= a; 1264 } 1265 uniformColor(SkColor4f color,Uniforms * uniforms)1266 Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) { 1267 auto [r,g,b,a] = color; 1268 return { 1269 uniformF(uniforms->pushF(r)), 1270 uniformF(uniforms->pushF(g)), 1271 uniformF(uniforms->pushF(b)), 1272 uniformF(uniforms->pushF(a)), 1273 }; 1274 } 1275 lerp(F32 lo,F32 hi,F32 t)1276 F32 Builder::lerp(F32 lo, F32 hi, F32 t) { 1277 if (this->isImm(t.id, 0.0f)) { return lo; } 1278 if (this->isImm(t.id, 1.0f)) { return hi; } 1279 return mad(sub(hi, lo), t, lo); 1280 } 1281 lerp(Color lo,Color hi,F32 t)1282 Color Builder::lerp(Color lo, Color hi, F32 t) { 1283 return { 1284 lerp(lo.r, hi.r, t), 1285 lerp(lo.g, hi.g, t), 1286 lerp(lo.b, hi.b, t), 1287 lerp(lo.a, hi.a, t), 1288 }; 1289 } 1290 to_hsla(Color c)1291 HSLA Builder::to_hsla(Color c) { 1292 F32 mx = max(max(c.r,c.g),c.b), 1293 mn = min(min(c.r,c.g),c.b), 1294 d = mx - mn, 1295 invd = 1.0f / d, 1296 g_lt_b = select(c.g < c.b, splat(6.0f) 1297 , splat(0.0f)); 1298 1299 F32 h = (1/6.0f) * select(mx == mn, 0.0f, 1300 select(mx == c.r, invd * (c.g - c.b) + g_lt_b, 1301 select(mx == c.g, invd * (c.b - c.r) + 2.0f 1302 , invd * (c.r - c.g) + 4.0f))); 1303 1304 F32 sum = mx + mn, 1305 l = sum * 0.5f, 1306 s = select(mx == mn, 0.0f 1307 , d / select(l > 0.5f, 2.0f - sum 1308 , sum)); 1309 return {h, s, l, c.a}; 1310 } 1311 to_rgba(HSLA c)1312 Color Builder::to_rgba(HSLA c) { 1313 // See GrRGBToHSLFilterEffect.fp 1314 1315 auto [h,s,l,a] = c; 1316 F32 x = s * (1.0f - abs(l + l - 1.0f)); 1317 1318 auto hue_to_rgb = [&,l=l](auto hue) { 1319 auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f; 1320 return x * (clamp01(q) - 0.5f) + l; 1321 }; 1322 1323 return { 1324 hue_to_rgb(h + 0/3.0f), 1325 hue_to_rgb(h + 2/3.0f), 1326 hue_to_rgb(h + 1/3.0f), 1327 c.a, 1328 }; 1329 } 1330 1331 // We're basing our implementation of non-separable blend modes on 1332 // https://www.w3.org/TR/compositing-1/#blendingnonseparable. 1333 // and 1334 // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf 1335 // They're equivalent, but ES' math has been better simplified. 1336 // 1337 // Anything extra we add beyond that is to make the math work with premul inputs. 1338 saturation(skvm::F32 r,skvm::F32 g,skvm::F32 b)1339 static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1340 return max(r, max(g, b)) 1341 - min(r, min(g, b)); 1342 } 1343 luminance(skvm::F32 r,skvm::F32 g,skvm::F32 b)1344 static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1345 return r*0.30f + g*0.59f + b*0.11f; 1346 } 1347 set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1348 static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) { 1349 F32 mn = min(*r, min(*g, *b)), 1350 mx = max(*r, max(*g, *b)), 1351 sat = mx - mn; 1352 1353 // Map min channel to 0, max channel to s, and scale the middle proportionally. 1354 auto scale = [&](skvm::F32 c) { 1355 auto scaled = ((c - mn) * s) / sat; 1356 return select(is_finite(scaled), scaled, 0.0f); 1357 }; 1358 *r = scale(*r); 1359 *g = scale(*g); 1360 *b = scale(*b); 1361 } 1362 set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1363 static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { 1364 auto diff = lu - luminance(*r, *g, *b); 1365 *r += diff; 1366 *g += diff; 1367 *b += diff; 1368 } 1369 clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1370 static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { 1371 F32 mn = min(*r, min(*g, *b)), 1372 mx = max(*r, max(*g, *b)), 1373 lu = luminance(*r, *g, *b); 1374 1375 auto clip = [&](auto c) { 1376 c = select(mn >= 0, c 1377 , lu + ((c-lu)*( lu)) / (lu-mn)); 1378 c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu) 1379 , c); 1380 return clamp01(c); // May be a little negative, or worse, NaN. 1381 }; 1382 *r = clip(*r); 1383 *g = clip(*g); 1384 *b = clip(*b); 1385 } 1386 blend(SkBlendMode mode,Color src,Color dst)1387 Color Builder::blend(SkBlendMode mode, Color src, Color dst) { 1388 auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) { 1389 return x*y + z*w; 1390 }; 1391 1392 auto two = [](skvm::F32 x) { return x+x; }; 1393 1394 auto apply_rgba = [&](auto fn) { 1395 return Color { 1396 fn(src.r, dst.r), 1397 fn(src.g, dst.g), 1398 fn(src.b, dst.b), 1399 fn(src.a, dst.a), 1400 }; 1401 }; 1402 1403 auto apply_rgb_srcover_a = [&](auto fn) { 1404 return Color { 1405 fn(src.r, dst.r), 1406 fn(src.g, dst.g), 1407 fn(src.b, dst.b), 1408 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1409 }; 1410 }; 1411 1412 auto non_sep = [&](auto R, auto G, auto B) { 1413 return Color{ 1414 R + mma(src.r, 1-dst.a, dst.r, 1-src.a), 1415 G + mma(src.g, 1-dst.a, dst.g, 1-src.a), 1416 B + mma(src.b, 1-dst.a, dst.b, 1-src.a), 1417 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1418 }; 1419 }; 1420 1421 switch (mode) { 1422 default: 1423 SkASSERT(false); 1424 [[fallthrough]]; /*but also, for safety, fallthrough*/ 1425 1426 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) }; 1427 1428 case SkBlendMode::kSrc: return src; 1429 case SkBlendMode::kDst: return dst; 1430 1431 case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]]; 1432 case SkBlendMode::kSrcOver: 1433 return apply_rgba([&](auto s, auto d) { 1434 return mad(d,1-src.a, s); 1435 }); 1436 1437 case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]]; 1438 case SkBlendMode::kSrcIn: 1439 return apply_rgba([&](auto s, auto d) { 1440 return s * dst.a; 1441 }); 1442 1443 case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]]; 1444 1445 case SkBlendMode::kSrcOut: 1446 return apply_rgba([&](auto s, auto d) { 1447 return s * (1-dst.a); 1448 }); 1449 1450 case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]]; 1451 case SkBlendMode::kSrcATop: 1452 return apply_rgba([&](auto s, auto d) { 1453 return mma(s, dst.a, d, 1-src.a); 1454 }); 1455 1456 case SkBlendMode::kXor: 1457 return apply_rgba([&](auto s, auto d) { 1458 return mma(s, 1-dst.a, d, 1-src.a); 1459 }); 1460 1461 case SkBlendMode::kPlus: 1462 return apply_rgba([&](auto s, auto d) { 1463 return min(s+d, 1.0f); 1464 }); 1465 1466 case SkBlendMode::kModulate: 1467 return apply_rgba([&](auto s, auto d) { 1468 return s * d; 1469 }); 1470 1471 case SkBlendMode::kScreen: 1472 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts. 1473 // It's kind of plausible that s + (d - sd) keeps more precision? 1474 return apply_rgba([&](auto s, auto d) { 1475 return s + (d - s*d); 1476 }); 1477 1478 case SkBlendMode::kDarken: 1479 return apply_rgb_srcover_a([&](auto s, auto d) { 1480 return s + (d - max(s * dst.a, 1481 d * src.a)); 1482 }); 1483 1484 case SkBlendMode::kLighten: 1485 return apply_rgb_srcover_a([&](auto s, auto d) { 1486 return s + (d - min(s * dst.a, 1487 d * src.a)); 1488 }); 1489 1490 case SkBlendMode::kDifference: 1491 return apply_rgb_srcover_a([&](auto s, auto d) { 1492 return s + (d - two(min(s * dst.a, 1493 d * src.a))); 1494 }); 1495 1496 case SkBlendMode::kExclusion: 1497 return apply_rgb_srcover_a([&](auto s, auto d) { 1498 return s + (d - two(s * d)); 1499 }); 1500 1501 case SkBlendMode::kColorBurn: 1502 return apply_rgb_srcover_a([&](auto s, auto d) { 1503 auto mn = min(dst.a, 1504 src.a * (dst.a - d) / s), 1505 burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a); 1506 return select(d == dst.a , s * (1-dst.a) + d, 1507 select(is_finite(burn), burn 1508 , d * (1-src.a) + s)); 1509 }); 1510 1511 case SkBlendMode::kColorDodge: 1512 return apply_rgb_srcover_a([&](auto s, auto d) { 1513 auto dodge = src.a * min(dst.a, 1514 d * src.a / (src.a - s)) 1515 + mma(s, 1-dst.a, d, 1-src.a); 1516 return select(d == 0.0f , s * (1-dst.a) + d, 1517 select(is_finite(dodge), dodge 1518 , d * (1-src.a) + s)); 1519 }); 1520 1521 case SkBlendMode::kHardLight: 1522 return apply_rgb_srcover_a([&](auto s, auto d) { 1523 return mma(s, 1-dst.a, d, 1-src.a) + 1524 select(two(s) <= src.a, 1525 two(s * d), 1526 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1527 }); 1528 1529 case SkBlendMode::kOverlay: 1530 return apply_rgb_srcover_a([&](auto s, auto d) { 1531 return mma(s, 1-dst.a, d, 1-src.a) + 1532 select(two(d) <= dst.a, 1533 two(s * d), 1534 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1535 }); 1536 1537 case SkBlendMode::kMultiply: 1538 return apply_rgba([&](auto s, auto d) { 1539 return mma(s, 1-dst.a, d, 1-src.a) + s * d; 1540 }); 1541 1542 case SkBlendMode::kSoftLight: 1543 return apply_rgb_srcover_a([&](auto s, auto d) { 1544 auto m = select(dst.a > 0.0f, d / dst.a 1545 , 0.0f), 1546 s2 = two(s), 1547 m4 = 4*m; 1548 1549 // The logic forks three ways: 1550 // 1. dark src? 1551 // 2. light src, dark dst? 1552 // 3. light src, light dst? 1553 1554 // Used in case 1 1555 auto darkSrc = d * ((s2-src.a) * (1-m) + src.a), 1556 // Used in case 2 1557 darkDst = (m4 * m4 + m4) * (m-1) + 7*m, 1558 // Used in case 3. 1559 liteDst = sqrt(m) - m, 1560 // Used in 2 or 3? 1561 liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst 1562 , liteDst) 1563 + d * src.a; 1564 return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc 1565 , liteSrc); 1566 }); 1567 1568 case SkBlendMode::kHue: { 1569 skvm::F32 R = src.r * src.a, 1570 G = src.g * src.a, 1571 B = src.b * src.a; 1572 1573 set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b)); 1574 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1575 clip_color(&R, &G, &B, src.a * dst.a); 1576 1577 return non_sep(R, G, B); 1578 } 1579 1580 case SkBlendMode::kSaturation: { 1581 skvm::F32 R = dst.r * src.a, 1582 G = dst.g * src.a, 1583 B = dst.b * src.a; 1584 1585 set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b)); 1586 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1587 clip_color(&R, &G, &B, src.a * dst.a); 1588 1589 return non_sep(R, G, B); 1590 } 1591 1592 case SkBlendMode::kColor: { 1593 skvm::F32 R = src.r * dst.a, 1594 G = src.g * dst.a, 1595 B = src.b * dst.a; 1596 1597 set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b)); 1598 clip_color(&R, &G, &B, src.a * dst.a); 1599 1600 return non_sep(R, G, B); 1601 } 1602 1603 case SkBlendMode::kLuminosity: { 1604 skvm::F32 R = dst.r * src.a, 1605 G = dst.g * src.a, 1606 B = dst.b * src.a; 1607 1608 set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b)); 1609 clip_color(&R, &G, &B, dst.a * src.a); 1610 1611 return non_sep(R, G, B); 1612 } 1613 } 1614 } 1615 1616 // ~~~~ Program::eval() and co. ~~~~ // 1617 1618 // Handy references for x86-64 instruction encoding: 1619 // https://wiki.osdev.org/X86-64_Instruction_Encoding 1620 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm 1621 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm 1622 // http://ref.x86asm.net/coder64.html 1623 1624 // Used for ModRM / immediate instruction encoding. _233(int a,int b,int c)1625 static uint8_t _233(int a, int b, int c) { 1626 return (a & 3) << 6 1627 | (b & 7) << 3 1628 | (c & 7) << 0; 1629 } 1630 1631 // ModRM byte encodes the arguments of an opcode. 1632 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; mod_rm(Mod mod,int reg,int rm)1633 static uint8_t mod_rm(Mod mod, int reg, int rm) { 1634 return _233((int)mod, reg, rm); 1635 } 1636 mod(int imm)1637 static Mod mod(int imm) { 1638 if (imm == 0) { return Mod::Indirect; } 1639 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } 1640 return Mod::FourByteImm; 1641 } 1642 imm_bytes(Mod mod)1643 static int imm_bytes(Mod mod) { 1644 switch (mod) { 1645 case Mod::Indirect: return 0; 1646 case Mod::OneByteImm: return 1; 1647 case Mod::FourByteImm: return 4; 1648 case Mod::Direct: SkUNREACHABLE; 1649 } 1650 SkUNREACHABLE; 1651 } 1652 1653 // SIB byte encodes a memory address, base + (index * scale). sib(Assembler::Scale scale,int index,int base)1654 static uint8_t sib(Assembler::Scale scale, int index, int base) { 1655 return _233((int)scale, index, base); 1656 } 1657 1658 // The REX prefix is used to extend most old 32-bit instructions to 64-bit. rex(bool W,bool R,bool X,bool B)1659 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit. 1660 bool R, // Extra top bit to select ModRM reg, registers 8-15. 1661 bool X, // Extra top bit for SIB index register. 1662 bool B) { // Extra top bit for SIB base or ModRM rm register. 1663 return 0b01000000 // Fixed 0100 for top four bits. 1664 | (W << 3) 1665 | (R << 2) 1666 | (X << 1) 1667 | (B << 0); 1668 } 1669 1670 1671 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. 1672 struct VEX { 1673 int len; 1674 uint8_t bytes[3]; 1675 }; 1676 vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1677 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? 1678 bool R, // Same as REX R. Pass high bit of dst register, dst>>3. 1679 bool X, // Same as REX X. 1680 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. 1681 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. 1682 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. 1683 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. 1684 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. 1685 1686 // Pack x86 opcode map selector to 5-bit VEX encoding. 1687 map = [map]{ 1688 switch (map) { 1689 case 0x0f: return 0b00001; 1690 case 0x380f: return 0b00010; 1691 case 0x3a0f: return 0b00011; 1692 // Several more cases only used by XOP / TBM. 1693 } 1694 SkUNREACHABLE; 1695 }(); 1696 1697 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. 1698 pp = [pp]{ 1699 switch (pp) { 1700 case 0x66: return 0b01; 1701 case 0xf3: return 0b10; 1702 case 0xf2: return 0b11; 1703 } 1704 return 0b00; 1705 }(); 1706 1707 VEX vex = {0, {0,0,0}}; 1708 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { 1709 // With these conditions met, we can optionally compress VEX to 2-byte. 1710 vex.len = 2; 1711 vex.bytes[0] = 0xc5; 1712 vex.bytes[1] = (pp & 3) << 0 1713 | (L & 1) << 2 1714 | (~vvvv & 15) << 3 1715 | (~(int)R & 1) << 7; 1716 } else { 1717 // We could use this 3-byte VEX prefix all the time if we like. 1718 vex.len = 3; 1719 vex.bytes[0] = 0xc4; 1720 vex.bytes[1] = (map & 31) << 0 1721 | (~(int)B & 1) << 5 1722 | (~(int)X & 1) << 6 1723 | (~(int)R & 1) << 7; 1724 vex.bytes[2] = (pp & 3) << 0 1725 | (L & 1) << 2 1726 | (~vvvv & 15) << 3 1727 | (WE & 1) << 7; 1728 } 1729 return vex; 1730 } 1731 Assembler(void * buf)1732 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {} 1733 size() const1734 size_t Assembler::size() const { return fSize; } 1735 bytes(const void * p,int n)1736 void Assembler::bytes(const void* p, int n) { 1737 if (fCode) { 1738 memcpy(fCode+fSize, p, n); 1739 } 1740 fSize += n; 1741 } 1742 byte(uint8_t b)1743 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } word(uint32_t w)1744 void Assembler::word(uint32_t w) { this->bytes(&w, 4); } 1745 align(int mod)1746 void Assembler::align(int mod) { 1747 while (this->size() % mod) { 1748 this->byte(0x00); 1749 } 1750 } 1751 int3()1752 void Assembler::int3() { 1753 this->byte(0xcc); 1754 } 1755 vzeroupper()1756 void Assembler::vzeroupper() { 1757 this->byte(0xc5); 1758 this->byte(0xf8); 1759 this->byte(0x77); 1760 } ret()1761 void Assembler::ret() { this->byte(0xc3); } 1762 op(int opcode,Operand dst,GP64 x)1763 void Assembler::op(int opcode, Operand dst, GP64 x) { 1764 if (dst.kind == Operand::REG) { 1765 this->byte(rex(W1,x>>3,0,dst.reg>>3)); 1766 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1767 this->byte(mod_rm(Mod::Direct, x, dst.reg&7)); 1768 } else { 1769 SkASSERT(dst.kind == Operand::MEM); 1770 const Mem& m = dst.mem; 1771 const bool need_SIB = (m.base&7) == rsp 1772 || m.index != rsp; 1773 1774 this->byte(rex(W1,x>>3,m.index>>3,m.base>>3)); 1775 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1776 this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7)); 1777 if (need_SIB) { 1778 this->byte(sib(m.scale, m.index&7, m.base&7)); 1779 } 1780 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 1781 } 1782 } 1783 op(int opcode,int opcode_ext,Operand dst,int imm)1784 void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) { 1785 opcode |= 0b1000'0000; // top bit set for instructions with any immediate 1786 1787 int imm_bytes = 4; 1788 if (SkTFitsIn<int8_t>(imm)) { 1789 imm_bytes = 1; 1790 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. 1791 } 1792 1793 this->op(opcode, dst, (GP64)opcode_ext); 1794 this->bytes(&imm, imm_bytes); 1795 } 1796 add(Operand dst,int imm)1797 void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); } sub(Operand dst,int imm)1798 void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); } cmp(Operand dst,int imm)1799 void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); } 1800 1801 // These don't work quite like the other instructions with immediates: 1802 // these immediates are always fixed size at 4 bytes or 1 byte. mov(Operand dst,int imm)1803 void Assembler::mov(Operand dst, int imm) { 1804 this->op(0xC7,dst,(GP64)0b000); 1805 this->word(imm); 1806 } movb(Operand dst,int imm)1807 void Assembler::movb(Operand dst, int imm) { 1808 this->op(0xC6,dst,(GP64)0b000); 1809 this->byte(imm); 1810 } 1811 add(Operand dst,GP64 x)1812 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); } sub(Operand dst,GP64 x)1813 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); } cmp(Operand dst,GP64 x)1814 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); } mov(Operand dst,GP64 x)1815 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); } movb(Operand dst,GP64 x)1816 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); } 1817 add(GP64 dst,Operand x)1818 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); } sub(GP64 dst,Operand x)1819 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); } cmp(GP64 dst,Operand x)1820 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); } mov(GP64 dst,Operand x)1821 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); } movb(GP64 dst,Operand x)1822 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); } 1823 movzbq(GP64 dst,Operand x)1824 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); } movzwq(GP64 dst,Operand x)1825 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); } 1826 vpaddd(Ymm dst,Ymm x,Operand y)1827 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } vpsubd(Ymm dst,Ymm x,Operand y)1828 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } vpmulld(Ymm dst,Ymm x,Operand y)1829 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); } 1830 vpaddw(Ymm dst,Ymm x,Operand y)1831 void Assembler::vpaddw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfd, dst,x,y); } vpsubw(Ymm dst,Ymm x,Operand y)1832 void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xf9, dst,x,y); } vpmullw(Ymm dst,Ymm x,Operand y)1833 void Assembler::vpmullw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xd5, dst,x,y); } vpavgw(Ymm dst,Ymm x,Operand y)1834 void Assembler::vpavgw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xe3, dst,x,y); } vpmulhrsw(Ymm dst,Ymm x,Operand y)1835 void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); } vpminsw(Ymm dst,Ymm x,Operand y)1836 void Assembler::vpminsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xea, dst,x,y); } vpmaxsw(Ymm dst,Ymm x,Operand y)1837 void Assembler::vpmaxsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xee, dst,x,y); } vpminuw(Ymm dst,Ymm x,Operand y)1838 void Assembler::vpminuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); } vpmaxuw(Ymm dst,Ymm x,Operand y)1839 void Assembler::vpmaxuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); } 1840 vpabsw(Ymm dst,Operand x)1841 void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); } 1842 1843 vpand(Ymm dst,Ymm x,Operand y)1844 void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); } vpor(Ymm dst,Ymm x,Operand y)1845 void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); } vpxor(Ymm dst,Ymm x,Operand y)1846 void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); } vpandn(Ymm dst,Ymm x,Operand y)1847 void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); } 1848 vaddps(Ymm dst,Ymm x,Operand y)1849 void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); } vsubps(Ymm dst,Ymm x,Operand y)1850 void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); } vmulps(Ymm dst,Ymm x,Operand y)1851 void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); } vdivps(Ymm dst,Ymm x,Operand y)1852 void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); } vminps(Ymm dst,Ymm x,Operand y)1853 void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); } vmaxps(Ymm dst,Ymm x,Operand y)1854 void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); } 1855 vfmadd132ps(Ymm dst,Ymm x,Operand y)1856 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); } vfmadd213ps(Ymm dst,Ymm x,Operand y)1857 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); } vfmadd231ps(Ymm dst,Ymm x,Operand y)1858 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); } 1859 vfmsub132ps(Ymm dst,Ymm x,Operand y)1860 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); } vfmsub213ps(Ymm dst,Ymm x,Operand y)1861 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); } vfmsub231ps(Ymm dst,Ymm x,Operand y)1862 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); } 1863 vfnmadd132ps(Ymm dst,Ymm x,Operand y)1864 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); } vfnmadd213ps(Ymm dst,Ymm x,Operand y)1865 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); } vfnmadd231ps(Ymm dst,Ymm x,Operand y)1866 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); } 1867 vpackusdw(Ymm dst,Ymm x,Operand y)1868 void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); } vpackuswb(Ymm dst,Ymm x,Operand y)1869 void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); } 1870 vpunpckldq(Ymm dst,Ymm x,Operand y)1871 void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); } vpunpckhdq(Ymm dst,Ymm x,Operand y)1872 void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); } 1873 vpcmpeqd(Ymm dst,Ymm x,Operand y)1874 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); } vpcmpeqw(Ymm dst,Ymm x,Operand y)1875 void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); } vpcmpgtd(Ymm dst,Ymm x,Operand y)1876 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); } vpcmpgtw(Ymm dst,Ymm x,Operand y)1877 void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); } 1878 1879 imm_byte_after_operand(const Operand & operand,int imm)1880 void Assembler::imm_byte_after_operand(const Operand& operand, int imm) { 1881 // When we've embedded a label displacement in the middle of an instruction, 1882 // we need to tweak it a little so that the resolved displacement starts 1883 // from the end of the instruction and not the end of the displacement. 1884 if (operand.kind == Operand::LABEL && fCode) { 1885 int disp; 1886 memcpy(&disp, fCode+fSize-4, 4); 1887 disp--; 1888 memcpy(fCode+fSize-4, &disp, 4); 1889 } 1890 this->byte(imm); 1891 } 1892 vcmpps(Ymm dst,Ymm x,Operand y,int imm)1893 void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) { 1894 this->op(0,0x0f,0xc2, dst,x,y); 1895 this->imm_byte_after_operand(y, imm); 1896 } 1897 vpblendvb(Ymm dst,Ymm x,Operand y,Ymm z)1898 void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) { 1899 this->op(0x66,0x3a0f,0x4c, dst,x,y); 1900 this->imm_byte_after_operand(y, z << 4); 1901 } 1902 1903 // Shift instructions encode their opcode extension as "dst", dst as x, and x as y. vpslld(Ymm dst,Ymm x,int imm)1904 void Assembler::vpslld(Ymm dst, Ymm x, int imm) { 1905 this->op(0x66,0x0f,0x72,(Ymm)6, dst,x); 1906 this->byte(imm); 1907 } vpsrld(Ymm dst,Ymm x,int imm)1908 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { 1909 this->op(0x66,0x0f,0x72,(Ymm)2, dst,x); 1910 this->byte(imm); 1911 } vpsrad(Ymm dst,Ymm x,int imm)1912 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { 1913 this->op(0x66,0x0f,0x72,(Ymm)4, dst,x); 1914 this->byte(imm); 1915 } vpsllw(Ymm dst,Ymm x,int imm)1916 void Assembler::vpsllw(Ymm dst, Ymm x, int imm) { 1917 this->op(0x66,0x0f,0x71,(Ymm)6, dst,x); 1918 this->byte(imm); 1919 } vpsrlw(Ymm dst,Ymm x,int imm)1920 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { 1921 this->op(0x66,0x0f,0x71,(Ymm)2, dst,x); 1922 this->byte(imm); 1923 } vpsraw(Ymm dst,Ymm x,int imm)1924 void Assembler::vpsraw(Ymm dst, Ymm x, int imm) { 1925 this->op(0x66,0x0f,0x71,(Ymm)4, dst,x); 1926 this->byte(imm); 1927 } 1928 vpermq(Ymm dst,Operand x,int imm)1929 void Assembler::vpermq(Ymm dst, Operand x, int imm) { 1930 // A bit unusual among the instructions we use, this is 64-bit operation, so we set W. 1931 this->op(0x66,0x3a0f,0x00, dst,x,W1); 1932 this->imm_byte_after_operand(x, imm); 1933 } 1934 vperm2f128(Ymm dst,Ymm x,Operand y,int imm)1935 void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) { 1936 this->op(0x66,0x3a0f,0x06, dst,x,y); 1937 this->imm_byte_after_operand(y, imm); 1938 } 1939 vpermps(Ymm dst,Ymm ix,Operand src)1940 void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) { 1941 this->op(0x66,0x380f,0x16, dst,ix,src); 1942 } 1943 vroundps(Ymm dst,Operand x,Rounding imm)1944 void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) { 1945 this->op(0x66,0x3a0f,0x08, dst,x); 1946 this->imm_byte_after_operand(x, imm); 1947 } 1948 vmovdqa(Ymm dst,Operand src)1949 void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); } vmovups(Ymm dst,Operand src)1950 void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Xmm dst,Operand src)1951 void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Operand dst,Ymm src)1952 void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); } vmovups(Operand dst,Xmm src)1953 void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); } 1954 vcvtdq2ps(Ymm dst,Operand x)1955 void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); } vcvttps2dq(Ymm dst,Operand x)1956 void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); } vcvtps2dq(Ymm dst,Operand x)1957 void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); } vsqrtps(Ymm dst,Operand x)1958 void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); } 1959 vcvtps2ph(Operand dst,Ymm x,Rounding imm)1960 void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) { 1961 this->op(0x66,0x3a0f,0x1d, x,dst); 1962 this->imm_byte_after_operand(dst, imm); 1963 } vcvtph2ps(Ymm dst,Operand x)1964 void Assembler::vcvtph2ps(Ymm dst, Operand x) { 1965 this->op(0x66,0x380f,0x13, dst,x); 1966 } 1967 disp19(Label * l)1968 int Assembler::disp19(Label* l) { 1969 SkASSERT(l->kind == Label::NotYetSet || 1970 l->kind == Label::ARMDisp19); 1971 int here = (int)this->size(); 1972 l->kind = Label::ARMDisp19; 1973 l->references.push_back(here); 1974 // ARM 19-bit instruction count, from the beginning of this instruction. 1975 return (l->offset - here) / 4; 1976 } 1977 disp32(Label * l)1978 int Assembler::disp32(Label* l) { 1979 SkASSERT(l->kind == Label::NotYetSet || 1980 l->kind == Label::X86Disp32); 1981 int here = (int)this->size(); 1982 l->kind = Label::X86Disp32; 1983 l->references.push_back(here); 1984 // x86 32-bit byte count, from the end of this instruction. 1985 return l->offset - (here + 4); 1986 } 1987 op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)1988 void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) { 1989 switch (y.kind) { 1990 case Operand::REG: { 1991 VEX v = vex(w, dst>>3, 0, y.reg>>3, 1992 map, x, l, prefix); 1993 this->bytes(v.bytes, v.len); 1994 this->byte(opcode); 1995 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7)); 1996 } return; 1997 1998 case Operand::MEM: { 1999 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows; 2000 // without an SIB byte, that's where the base register would usually go. 2001 // This means we have to use an SIB byte if we want to use rsp as a base register. 2002 const Mem& m = y.mem; 2003 const bool need_SIB = m.base == rsp 2004 || m.index != rsp; 2005 2006 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3, 2007 map, x, l, prefix); 2008 this->bytes(v.bytes, v.len); 2009 this->byte(opcode); 2010 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7)); 2011 if (need_SIB) { 2012 this->byte(sib(m.scale, m.index&7, m.base&7)); 2013 } 2014 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 2015 } return; 2016 2017 case Operand::LABEL: { 2018 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. 2019 const int rip = rbp; 2020 2021 VEX v = vex(w, dst>>3, 0, rip>>3, 2022 map, x, l, prefix); 2023 this->bytes(v.bytes, v.len); 2024 this->byte(opcode); 2025 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); 2026 this->word(this->disp32(y.label)); 2027 } return; 2028 } 2029 } 2030 vpshufb(Ymm dst,Ymm x,Operand y)2031 void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); } 2032 vptest(Ymm x,Operand y)2033 void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); } 2034 vbroadcastss(Ymm dst,Operand y)2035 void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); } 2036 jump(uint8_t condition,Label * l)2037 void Assembler::jump(uint8_t condition, Label* l) { 2038 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): 2039 // 7? one-byte-disp 2040 // 0F 8? four-byte-disp 2041 // We always use the near displacement to make updating labels simpler (no resizing). 2042 this->byte(0x0f); 2043 this->byte(condition); 2044 this->word(this->disp32(l)); 2045 } je(Label * l)2046 void Assembler::je (Label* l) { this->jump(0x84, l); } jne(Label * l)2047 void Assembler::jne(Label* l) { this->jump(0x85, l); } jl(Label * l)2048 void Assembler::jl (Label* l) { this->jump(0x8c, l); } jc(Label * l)2049 void Assembler::jc (Label* l) { this->jump(0x82, l); } 2050 jmp(Label * l)2051 void Assembler::jmp(Label* l) { 2052 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit. 2053 this->byte(0xe9); 2054 this->word(this->disp32(l)); 2055 } 2056 vpmovzxwd(Ymm dst,Operand src)2057 void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); } vpmovzxbd(Ymm dst,Operand src)2058 void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); } 2059 vmovq(Operand dst,Xmm src)2060 void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); } 2061 vmovd(Operand dst,Xmm src)2062 void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); } vmovd(Xmm dst,Operand src)2063 void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); } 2064 vpinsrd(Xmm dst,Xmm src,Operand y,int imm)2065 void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) { 2066 this->op(0x66,0x3a0f,0x22, dst,src,y); 2067 this->imm_byte_after_operand(y, imm); 2068 } vpinsrw(Xmm dst,Xmm src,Operand y,int imm)2069 void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) { 2070 this->op(0x66,0x0f,0xc4, dst,src,y); 2071 this->imm_byte_after_operand(y, imm); 2072 } vpinsrb(Xmm dst,Xmm src,Operand y,int imm)2073 void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) { 2074 this->op(0x66,0x3a0f,0x20, dst,src,y); 2075 this->imm_byte_after_operand(y, imm); 2076 } 2077 vextracti128(Operand dst,Ymm src,int imm)2078 void Assembler::vextracti128(Operand dst, Ymm src, int imm) { 2079 this->op(0x66,0x3a0f,0x39, src,dst); 2080 SkASSERT(dst.kind != Operand::LABEL); 2081 this->byte(imm); 2082 } vpextrd(Operand dst,Xmm src,int imm)2083 void Assembler::vpextrd(Operand dst, Xmm src, int imm) { 2084 this->op(0x66,0x3a0f,0x16, src,dst); 2085 SkASSERT(dst.kind != Operand::LABEL); 2086 this->byte(imm); 2087 } vpextrw(Operand dst,Xmm src,int imm)2088 void Assembler::vpextrw(Operand dst, Xmm src, int imm) { 2089 this->op(0x66,0x3a0f,0x15, src,dst); 2090 SkASSERT(dst.kind != Operand::LABEL); 2091 this->byte(imm); 2092 } vpextrb(Operand dst,Xmm src,int imm)2093 void Assembler::vpextrb(Operand dst, Xmm src, int imm) { 2094 this->op(0x66,0x3a0f,0x14, src,dst); 2095 SkASSERT(dst.kind != Operand::LABEL); 2096 this->byte(imm); 2097 } 2098 vgatherdps(Ymm dst,Scale scale,Ymm ix,GP64 base,Ymm mask)2099 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { 2100 // Unlike most instructions, no aliasing is permitted here. 2101 SkASSERT(dst != ix); 2102 SkASSERT(dst != mask); 2103 SkASSERT(mask != ix); 2104 2105 int prefix = 0x66, 2106 map = 0x380f, 2107 opcode = 0x92; 2108 VEX v = vex(0, dst>>3, ix>>3, base>>3, 2109 map, mask, /*ymm?*/1, prefix); 2110 this->bytes(v.bytes, v.len); 2111 this->byte(opcode); 2112 this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); 2113 this->byte(sib(scale, ix&7, base&7)); 2114 } 2115 2116 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf 2117 operator ""_mask(unsigned long long bits)2118 static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; } 2119 op(uint32_t hi,V m,uint32_t lo,V n,V d)2120 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) { 2121 this->word( (hi & 11_mask) << 21 2122 | (m & 5_mask) << 16 2123 | (lo & 6_mask) << 10 2124 | (n & 5_mask) << 5 2125 | (d & 5_mask) << 0); 2126 } op(uint32_t op22,V n,V d,int imm)2127 void Assembler::op(uint32_t op22, V n, V d, int imm) { 2128 this->word( (op22 & 22_mask) << 10 2129 | imm // size and location depends on the instruction 2130 | (n & 5_mask) << 5 2131 | (d & 5_mask) << 0); 2132 } 2133 and16b(V d,V n,V m)2134 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); } orr16b(V d,V n,V m)2135 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); } eor16b(V d,V n,V m)2136 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); } bic16b(V d,V n,V m)2137 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); } bsl16b(V d,V n,V m)2138 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); } not16b(V d,V n)2139 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); } 2140 add4s(V d,V n,V m)2141 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); } sub4s(V d,V n,V m)2142 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); } mul4s(V d,V n,V m)2143 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); } 2144 cmeq4s(V d,V n,V m)2145 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); } cmgt4s(V d,V n,V m)2146 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); } 2147 sub8h(V d,V n,V m)2148 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); } mul8h(V d,V n,V m)2149 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); } 2150 fadd4s(V d,V n,V m)2151 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); } fsub4s(V d,V n,V m)2152 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); } fmul4s(V d,V n,V m)2153 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); } fdiv4s(V d,V n,V m)2154 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); } fmin4s(V d,V n,V m)2155 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); } fmax4s(V d,V n,V m)2156 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); } 2157 fneg4s(V d,V n)2158 void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); } fsqrt4s(V d,V n)2159 void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); } 2160 fcmeq4s(V d,V n,V m)2161 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); } fcmgt4s(V d,V n,V m)2162 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); } fcmge4s(V d,V n,V m)2163 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); } 2164 fmla4s(V d,V n,V m)2165 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); } fmls4s(V d,V n,V m)2166 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); } 2167 tbl(V d,V n,V m)2168 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); } 2169 uzp14s(V d,V n,V m)2170 void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); } uzp24s(V d,V n,V m)2171 void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); } zip14s(V d,V n,V m)2172 void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); } zip24s(V d,V n,V m)2173 void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); } 2174 sli4s(V d,V n,int imm5)2175 void Assembler::sli4s(V d, V n, int imm5) { 2176 this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); 2177 } shl4s(V d,V n,int imm5)2178 void Assembler::shl4s(V d, V n, int imm5) { 2179 this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); 2180 } sshr4s(V d,V n,int imm5)2181 void Assembler::sshr4s(V d, V n, int imm5) { 2182 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); 2183 } ushr4s(V d,V n,int imm5)2184 void Assembler::ushr4s(V d, V n, int imm5) { 2185 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); 2186 } ushr8h(V d,V n,int imm4)2187 void Assembler::ushr8h(V d, V n, int imm4) { 2188 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16); 2189 } 2190 scvtf4s(V d,V n)2191 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); } fcvtzs4s(V d,V n)2192 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } fcvtns4s(V d,V n)2193 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } frintp4s(V d,V n)2194 void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); } frintm4s(V d,V n)2195 void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); } 2196 fcvtn(V d,V n)2197 void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); } fcvtl(V d,V n)2198 void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); } 2199 xtns2h(V d,V n)2200 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); } xtnh2b(V d,V n)2201 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); } 2202 uxtlb2h(V d,V n)2203 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); } uxtlh2s(V d,V n)2204 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); } 2205 uminv4s(V d,V n)2206 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); } 2207 brk(int imm16)2208 void Assembler::brk(int imm16) { 2209 this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5); 2210 } 2211 ret(X n)2212 void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); } 2213 add(X d,X n,int imm12)2214 void Assembler::add(X d, X n, int imm12) { 2215 this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2216 } sub(X d,X n,int imm12)2217 void Assembler::sub(X d, X n, int imm12) { 2218 this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2219 } subs(X d,X n,int imm12)2220 void Assembler::subs(X d, X n, int imm12) { 2221 this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2222 } 2223 add(X d,X n,X m,Shift shift,int imm6)2224 void Assembler::add(X d, X n, X m, Shift shift, int imm6) { 2225 SkASSERT(shift != ROR); 2226 2227 int imm = (imm6 & 6_mask) << 0 2228 | (m & 5_mask) << 6 2229 | (0 & 1_mask) << 11 2230 | (shift & 2_mask) << 12; 2231 this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10); 2232 } 2233 b(Condition cond,Label * l)2234 void Assembler::b(Condition cond, Label* l) { 2235 const int imm19 = this->disp19(l); 2236 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5); 2237 } cbz(X t,Label * l)2238 void Assembler::cbz(X t, Label* l) { 2239 const int imm19 = this->disp19(l); 2240 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2241 } cbnz(X t,Label * l)2242 void Assembler::cbnz(X t, Label* l) { 2243 const int imm19 = this->disp19(l); 2244 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2245 } 2246 ldrd(X dst,X src,int imm12)2247 void Assembler::ldrd(X dst, X src, int imm12) { 2248 this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2249 } ldrs(X dst,X src,int imm12)2250 void Assembler::ldrs(X dst, X src, int imm12) { 2251 this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2252 } ldrh(X dst,X src,int imm12)2253 void Assembler::ldrh(X dst, X src, int imm12) { 2254 this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2255 } ldrb(X dst,X src,int imm12)2256 void Assembler::ldrb(X dst, X src, int imm12) { 2257 this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2258 } 2259 ldrq(V dst,X src,int imm12)2260 void Assembler::ldrq(V dst, X src, int imm12) { 2261 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10); 2262 } ldrd(V dst,X src,int imm12)2263 void Assembler::ldrd(V dst, X src, int imm12) { 2264 this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2265 } ldrs(V dst,X src,int imm12)2266 void Assembler::ldrs(V dst, X src, int imm12) { 2267 this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2268 } ldrh(V dst,X src,int imm12)2269 void Assembler::ldrh(V dst, X src, int imm12) { 2270 this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2271 } ldrb(V dst,X src,int imm12)2272 void Assembler::ldrb(V dst, X src, int imm12) { 2273 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2274 } 2275 strs(X src,X dst,int imm12)2276 void Assembler::strs(X src, X dst, int imm12) { 2277 this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2278 } 2279 strq(V src,X dst,int imm12)2280 void Assembler::strq(V src, X dst, int imm12) { 2281 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10); 2282 } strd(V src,X dst,int imm12)2283 void Assembler::strd(V src, X dst, int imm12) { 2284 this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2285 } strs(V src,X dst,int imm12)2286 void Assembler::strs(V src, X dst, int imm12) { 2287 this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2288 } strh(V src,X dst,int imm12)2289 void Assembler::strh(V src, X dst, int imm12) { 2290 this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2291 } strb(V src,X dst,int imm12)2292 void Assembler::strb(V src, X dst, int imm12) { 2293 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2294 } 2295 movs(X dst,V src,int lane)2296 void Assembler::movs(X dst, V src, int lane) { 2297 int imm5 = (lane << 3) | 0b100; 2298 this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16); 2299 } inss(V dst,X src,int lane)2300 void Assembler::inss(V dst, X src, int lane) { 2301 int imm5 = (lane << 3) | 0b100; 2302 this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16); 2303 } 2304 2305 ldrq(V dst,Label * l)2306 void Assembler::ldrq(V dst, Label* l) { 2307 const int imm19 = this->disp19(l); 2308 this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5); 2309 } 2310 dup4s(V dst,X src)2311 void Assembler::dup4s(V dst, X src) { 2312 this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst); 2313 } 2314 ld1r4s(V dst,X src)2315 void Assembler::ld1r4s(V dst, X src) { 2316 this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst); 2317 } ld1r8h(V dst,X src)2318 void Assembler::ld1r8h(V dst, X src) { 2319 this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst); 2320 } ld1r16b(V dst,X src)2321 void Assembler::ld1r16b(V dst, X src) { 2322 this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst); 2323 } 2324 ld24s(V dst,X src)2325 void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); } ld44s(V dst,X src)2326 void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); } st24s(V src,X dst)2327 void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); } st44s(V src,X dst)2328 void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); } 2329 ld24s(V dst,X src,int lane)2330 void Assembler::ld24s(V dst, X src, int lane) { 2331 int Q = (lane & 2)>>1, 2332 S = (lane & 1); 2333 /* Q S */ 2334 this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12)); 2335 } ld44s(V dst,X src,int lane)2336 void Assembler::ld44s(V dst, X src, int lane) { 2337 int Q = (lane & 2)>>1, 2338 S = (lane & 1); 2339 this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12)); 2340 } 2341 label(Label * l)2342 void Assembler::label(Label* l) { 2343 if (fCode) { 2344 // The instructions all currently point to l->offset. 2345 // We'll want to add a delta to point them to here. 2346 int here = (int)this->size(); 2347 int delta = here - l->offset; 2348 l->offset = here; 2349 2350 if (l->kind == Label::ARMDisp19) { 2351 for (int ref : l->references) { 2352 // ref points to a 32-bit instruction with 19-bit displacement in instructions. 2353 uint32_t inst; 2354 memcpy(&inst, fCode + ref, 4); 2355 2356 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] 2357 int disp = (int)(inst << 8) >> 13; 2358 2359 disp += delta/4; // delta is in bytes, we want instructions. 2360 2361 // Put it all back together, preserving the high 8 bits and low 5. 2362 inst = ((disp << 5) & (19_mask << 5)) 2363 | ((inst ) & ~(19_mask << 5)); 2364 2365 memcpy(fCode + ref, &inst, 4); 2366 } 2367 } 2368 2369 if (l->kind == Label::X86Disp32) { 2370 for (int ref : l->references) { 2371 // ref points to a 32-bit displacement in bytes. 2372 int disp; 2373 memcpy(&disp, fCode + ref, 4); 2374 2375 disp += delta; 2376 2377 memcpy(fCode + ref, &disp, 4); 2378 } 2379 } 2380 } 2381 } 2382 eval(int n,void * args[]) const2383 void Program::eval(int n, void* args[]) const { 2384 #define SKVM_JIT_STATS 0 2385 #if SKVM_JIT_STATS 2386 static std::atomic<int64_t> calls{0}, jits{0}, 2387 pixels{0}, fast{0}; 2388 pixels += n; 2389 if (0 == calls++) { 2390 atexit([]{ 2391 int64_t num = jits .load(), 2392 den = calls.load(); 2393 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den); 2394 num = fast .load(); 2395 den = pixels.load(); 2396 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den); 2397 }); 2398 } 2399 #endif 2400 2401 #if !defined(SKVM_JIT_BUT_IGNORE_IT) 2402 const void* jit_entry = fImpl->jit_entry.load(); 2403 // jit_entry may be null either simply because we can't JIT, or when using LLVM 2404 // if the work represented by fImpl->llvm_compiling hasn't finished yet. 2405 // 2406 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it 2407 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off, 2408 // due to timing or program caching. 2409 if (jit_entry != nullptr && gSkVMAllowJIT) { 2410 #if SKVM_JIT_STATS 2411 jits++; 2412 fast += n; 2413 #endif 2414 void** a = args; 2415 switch (fImpl->strides.size()) { 2416 case 0: return ((void(*)(int ))jit_entry)(n ); 2417 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] ); 2418 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] ); 2419 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]); 2420 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry) 2421 (n,a[0],a[1],a[2],a[3]); 2422 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry) 2423 (n,a[0],a[1],a[2],a[3],a[4]); 2424 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry) 2425 (n,a[0],a[1],a[2],a[3],a[4],a[5]); 2426 case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry) 2427 (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]); 2428 default: SkASSERT(fImpl->strides.size() <= 7); 2429 } 2430 } 2431 #endif 2432 2433 // So we'll sometimes use the interpreter here even if later calls will use the JIT. 2434 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), 2435 this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(), 2436 n, args); 2437 } 2438 2439 #if defined(SKVM_LLVM) setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2440 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, 2441 const char* debug_name) { 2442 auto ctx = std::make_unique<llvm::LLVMContext>(); 2443 2444 auto mod = std::make_unique<llvm::Module>("", *ctx); 2445 // All the scary bare pointers from here on are owned by ctx or mod, I think. 2446 2447 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. 2448 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4; 2449 2450 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), 2451 *i32 = llvm::Type::getInt32Ty(*ctx); 2452 2453 std::vector<llvm::Type*> arg_types = { i32 }; 2454 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2455 arg_types.push_back(ptr); 2456 } 2457 2458 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), 2459 arg_types, /*vararg?=*/false); 2460 llvm::Function* fn 2461 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); 2462 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2463 fn->addParamAttr(i+1, llvm::Attribute::NoAlias); 2464 } 2465 2466 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), 2467 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn), 2468 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), 2469 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), 2470 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn), 2471 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), 2472 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), 2473 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); 2474 2475 using IRBuilder = llvm::IRBuilder<>; 2476 2477 llvm::PHINode* n; 2478 std::vector<llvm::PHINode*> args; 2479 std::vector<llvm::Value*> vals(instructions.size()); 2480 2481 auto emit = [&](size_t i, bool scalar, IRBuilder* b) { 2482 auto [op, x,y,z,w, immA,immB, death,can_hoist] = instructions[i]; 2483 2484 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), 2485 *i8 = llvm::Type::getInt8Ty (*ctx), 2486 *i16 = llvm::Type::getInt16Ty(*ctx), 2487 *f32 = llvm::Type::getFloatTy(*ctx), 2488 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ), 2489 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ), 2490 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ), 2491 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ), 2492 *F32 = scalar ? f32 : llvm::VectorType::get(f32, K ); 2493 2494 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; 2495 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; 2496 2497 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; 2498 2499 switch (llvm::Type* t = nullptr; op) { 2500 default: 2501 SkDebugf("can't llvm %s (%d)\n", name(op), op); 2502 return false; 2503 2504 case Op::assert_true: /*TODO*/ break; 2505 2506 case Op::index: 2507 if (I32->isVectorTy()) { 2508 std::vector<llvm::Constant*> iota(K); 2509 for (int j = 0; j < K; j++) { 2510 iota[j] = b->getInt32(j); 2511 } 2512 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), 2513 llvm::ConstantVector::get(iota)); 2514 } else { 2515 vals[i] = n; 2516 } break; 2517 2518 case Op::load8: t = I8 ; goto load; 2519 case Op::load16: t = I16; goto load; 2520 case Op::load32: t = I32; goto load; 2521 load: { 2522 llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo()); 2523 vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32); 2524 } break; 2525 2526 2527 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break; 2528 2529 case Op::uniform32: { 2530 llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, 2531 args[immA], 2532 immB), 2533 i32->getPointerTo()); 2534 llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32); 2535 vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val) 2536 : val; 2537 } break; 2538 2539 case Op::gather8: t = i8 ; goto gather; 2540 case Op::gather16: t = i16; goto gather; 2541 case Op::gather32: t = i32; goto gather; 2542 gather: { 2543 // Our gather base pointer is immB bytes off of uniform immA. 2544 llvm::Value* base = 2545 b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, 2546 args[immA], 2547 immB), 2548 t->getPointerTo()->getPointerTo())); 2549 2550 llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]); 2551 llvm::Value* gathered; 2552 if (ptr->getType()->isVectorTy()) { 2553 gathered = b->CreateMaskedGather(ptr, 1); 2554 } else { 2555 gathered = b->CreateAlignedLoad(ptr, 1); 2556 } 2557 vals[i] = b->CreateZExt(gathered, I32); 2558 } break; 2559 2560 case Op::store8: t = I8 ; goto store; 2561 case Op::store16: t = I16; goto store; 2562 case Op::store32: t = I32; goto store; 2563 store: { 2564 llvm::Value* val = b->CreateTrunc(vals[x], t); 2565 llvm::Value* ptr = b->CreateBitCast(args[immA], 2566 val->getType()->getPointerTo()); 2567 vals[i] = b->CreateAlignedStore(val, ptr, 1); 2568 } break; 2569 2570 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; 2571 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; 2572 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; 2573 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; 2574 2575 case Op::select: 2576 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); 2577 break; 2578 2579 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; 2580 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; 2581 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; 2582 2583 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break; 2584 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break; 2585 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break; 2586 2587 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; 2588 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; 2589 2590 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; 2591 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; 2592 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; 2593 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; 2594 2595 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; 2596 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; 2597 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; 2598 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; 2599 2600 case Op::fma_f32: 2601 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2602 {F(vals[x]), F(vals[y]), F(vals[z])})); 2603 break; 2604 2605 case Op::fms_f32: 2606 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2607 {F(vals[x]), F(vals[y]), 2608 b->CreateFNeg(F(vals[z]))})); 2609 break; 2610 2611 case Op::fnma_f32: 2612 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2613 {b->CreateFNeg(F(vals[x])), F(vals[y]), 2614 F(vals[z])})); 2615 break; 2616 2617 case Op::ceil: 2618 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x]))); 2619 break; 2620 case Op::floor: 2621 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); 2622 break; 2623 2624 case Op::max_f32: 2625 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), 2626 F(vals[y]), F(vals[x]))); 2627 break; 2628 case Op::min_f32: 2629 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), 2630 F(vals[y]), F(vals[x]))); 2631 break; 2632 2633 case Op::sqrt_f32: 2634 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); 2635 break; 2636 2637 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; 2638 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; 2639 case Op::round : { 2640 // Basic impl when we can't use cvtps2dq and co. 2641 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); 2642 vals[i] = b->CreateFPToSI(round, I32); 2643 2644 #if 1 && defined(SK_CPU_X86) 2645 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. 2646 if (scalar) { 2647 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ 2648 llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4)); 2649 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); 2650 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); 2651 } else { 2652 SkASSERT(K == 4 || K == 8); 2653 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : 2654 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; 2655 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); 2656 } 2657 #endif 2658 } break; 2659 2660 } 2661 return true; 2662 }; 2663 2664 { 2665 IRBuilder b(enter); 2666 b.CreateBr(hoistK); 2667 } 2668 2669 // hoistK: emit each hoistable vector instruction; goto testK; 2670 // LLVM can do this sort of thing itself, but we've got the information cheap, 2671 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. 2672 { 2673 IRBuilder b(hoistK); 2674 2675 // Hoisted instructions will need args (think, uniforms), so set that up now. 2676 // These phi nodes are degenerate... they'll always be the passed-in args from enter. 2677 // Later on when we start looping the phi nodes will start looking useful. 2678 llvm::Argument* arg = fn->arg_begin(); 2679 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. 2680 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2681 args.push_back(b.CreatePHI(arg->getType(), 1)); 2682 args.back()->addIncoming(arg++, enter); 2683 } 2684 2685 for (size_t i = 0; i < instructions.size(); i++) { 2686 if (instructions[i].can_hoist && !emit(i, false, &b)) { 2687 return; 2688 } 2689 } 2690 2691 b.CreateBr(testK); 2692 } 2693 2694 // testK: if (N >= K) goto loopK; else goto hoist1; 2695 { 2696 IRBuilder b(testK); 2697 2698 // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK. 2699 // These also start as the initial function arguments; hoistK can't have changed them. 2700 llvm::Argument* arg = fn->arg_begin(); 2701 2702 n = b.CreatePHI(arg->getType(), 2); 2703 n->addIncoming(arg++, hoistK); 2704 2705 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2706 args[i] = b.CreatePHI(arg->getType(), 2); 2707 args[i]->addIncoming(arg++, hoistK); 2708 } 2709 2710 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1); 2711 } 2712 2713 // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK; 2714 { 2715 IRBuilder b(loopK); 2716 for (size_t i = 0; i < instructions.size(); i++) { 2717 if (!instructions[i].can_hoist && !emit(i, false, &b)) { 2718 return; 2719 } 2720 } 2721 2722 // n -= K 2723 llvm::Value* n_next = b.CreateSub(n, b.getInt32(K)); 2724 n->addIncoming(n_next, loopK); 2725 2726 // Each arg ptr += K 2727 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2728 llvm::Value* arg_next 2729 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]); 2730 args[i]->addIncoming(arg_next, loopK); 2731 } 2732 b.CreateBr(testK); 2733 } 2734 2735 // hoist1: emit each hoistable scalar instruction; goto test1; 2736 { 2737 IRBuilder b(hoist1); 2738 for (size_t i = 0; i < instructions.size(); i++) { 2739 if (instructions[i].can_hoist && !emit(i, true, &b)) { 2740 return; 2741 } 2742 } 2743 b.CreateBr(test1); 2744 } 2745 2746 // test1: if (N >= 1) goto loop1; else goto leave; 2747 { 2748 IRBuilder b(test1); 2749 2750 // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1. 2751 llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2); 2752 n_new->addIncoming(n, hoist1); 2753 n = n_new; 2754 2755 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2756 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2); 2757 arg_new->addIncoming(args[i], hoist1); 2758 args[i] = arg_new; 2759 } 2760 2761 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave); 2762 } 2763 2764 // loop1: ... insts on scalars; N -= 1, args += stride; goto test1; 2765 { 2766 IRBuilder b(loop1); 2767 for (size_t i = 0; i < instructions.size(); i++) { 2768 if (!instructions[i].can_hoist && !emit(i, true, &b)) { 2769 return; 2770 } 2771 } 2772 2773 // n -= 1 2774 llvm::Value* n_next = b.CreateSub(n, b.getInt32(1)); 2775 n->addIncoming(n_next, loop1); 2776 2777 // Each arg ptr += K 2778 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2779 llvm::Value* arg_next 2780 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]); 2781 args[i]->addIncoming(arg_next, loop1); 2782 } 2783 b.CreateBr(test1); 2784 } 2785 2786 // leave: ret 2787 { 2788 IRBuilder b(leave); 2789 b.CreateRetVoid(); 2790 } 2791 2792 SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs())); 2793 2794 if (true) { 2795 SkString path = SkStringPrintf("/tmp/%s.bc", debug_name); 2796 std::error_code err; 2797 llvm::raw_fd_ostream os(path.c_str(), err); 2798 if (err) { 2799 return; 2800 } 2801 llvm::WriteBitcodeToFile(*mod, os); 2802 } 2803 2804 static SkOnce once; 2805 once([]{ 2806 SkAssertResult(false == llvm::InitializeNativeTarget()); 2807 SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter()); 2808 }); 2809 2810 if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod)) 2811 .setEngineKind(llvm::EngineKind::JIT) 2812 .setMCPU(llvm::sys::getHostCPUName()) 2813 .create()) { 2814 fImpl->llvm_ctx = std::move(ctx); 2815 fImpl->llvm_ee.reset(ee); 2816 2817 // We have to be careful here about what we close over and how, in case fImpl moves. 2818 // fImpl itself may change, but its pointee fields won't, so close over them by value. 2819 // Also, debug_name will almost certainly leave scope, so copy it. 2820 fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry, 2821 ee = fImpl->llvm_ee.get(), 2822 name = std::string(debug_name)]{ 2823 // std::atomic<void*>* dst; 2824 // llvm::ExecutionEngine* ee; 2825 // std::string name; 2826 dst->store( (void*)ee->getFunctionAddress(name.c_str()) ); 2827 }); 2828 } 2829 } 2830 #endif 2831 waitForLLVM() const2832 void Program::waitForLLVM() const { 2833 #if defined(SKVM_LLVM) 2834 if (fImpl->llvm_compiling.valid()) { 2835 fImpl->llvm_compiling.wait(); 2836 } 2837 #endif 2838 } 2839 hasJIT() const2840 bool Program::hasJIT() const { 2841 // Program::hasJIT() is really just a debugging / test aid, 2842 // so we don't mind adding a sync point here to wait for compilation. 2843 this->waitForLLVM(); 2844 2845 return fImpl->jit_entry.load() != nullptr; 2846 } 2847 dropJIT()2848 void Program::dropJIT() { 2849 #if defined(SKVM_LLVM) 2850 this->waitForLLVM(); 2851 fImpl->llvm_ee .reset(nullptr); 2852 fImpl->llvm_ctx.reset(nullptr); 2853 #elif defined(SKVM_JIT) 2854 if (fImpl->dylib) { 2855 close_dylib(fImpl->dylib); 2856 } else if (auto jit_entry = fImpl->jit_entry.load()) { 2857 unmap_jit_buffer(jit_entry, fImpl->jit_size); 2858 } 2859 #else 2860 SkASSERT(!this->hasJIT()); 2861 #endif 2862 2863 fImpl->jit_entry.store(nullptr); 2864 fImpl->jit_size = 0; 2865 fImpl->dylib = nullptr; 2866 } 2867 Program()2868 Program::Program() : fImpl(std::make_unique<Impl>()) {} 2869 ~Program()2870 Program::~Program() { 2871 // Moved-from Programs may have fImpl == nullptr. 2872 if (fImpl) { 2873 this->dropJIT(); 2874 } 2875 } 2876 Program(Program && other)2877 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} 2878 operator =(Program && other)2879 Program& Program::operator=(Program&& other) { 2880 fImpl = std::move(other.fImpl); 2881 return *this; 2882 } 2883 Program(const std::vector<OptimizedInstruction> & instructions,const std::vector<int> & strides,const char * debug_name,bool allow_jit)2884 Program::Program(const std::vector<OptimizedInstruction>& instructions, 2885 const std::vector<int>& strides, 2886 const char* debug_name, bool allow_jit) : Program() { 2887 fImpl->strides = strides; 2888 if (gSkVMAllowJIT && allow_jit) { 2889 #if 1 && defined(SKVM_LLVM) 2890 this->setupLLVM(instructions, debug_name); 2891 #elif 1 && defined(SKVM_JIT) 2892 this->setupJIT(instructions, debug_name); 2893 #endif 2894 } 2895 2896 // Might as well do this after setupLLVM() to get a little more time to compile. 2897 this->setupInterpreter(instructions); 2898 } 2899 instructions() const2900 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } nargs() const2901 int Program::nargs() const { return (int)fImpl->strides.size(); } nregs() const2902 int Program::nregs() const { return fImpl->regs; } loop() const2903 int Program::loop () const { return fImpl->loop; } empty() const2904 bool Program::empty() const { return fImpl->instructions.empty(); } 2905 2906 // Translate OptimizedInstructions to InterpreterInstructions. setupInterpreter(const std::vector<OptimizedInstruction> & instructions)2907 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { 2908 // Register each instruction is assigned to. 2909 std::vector<Reg> reg(instructions.size()); 2910 2911 // This next bit is a bit more complicated than strictly necessary; 2912 // we could just assign every instruction to its own register. 2913 // 2914 // But recycling registers is fairly cheap, and good practice for the 2915 // JITs where minimizing register pressure really is important. 2916 // 2917 // We have effectively infinite registers, so we hoist any value we can. 2918 // (The JIT may choose a more complex policy to reduce register pressure.) 2919 2920 fImpl->regs = 0; 2921 std::vector<Reg> avail; 2922 2923 // Assign this value to a register, recycling them where we can. 2924 auto assign_register = [&](Val id) { 2925 const OptimizedInstruction& inst = instructions[id]; 2926 2927 // If this is a real input and it's lifetime ends at this instruction, 2928 // we can recycle the register it's occupying. 2929 auto maybe_recycle_register = [&](Val input) { 2930 if (input != NA && instructions[input].death == id) { 2931 avail.push_back(reg[input]); 2932 } 2933 }; 2934 2935 // Take care to not recycle the same register twice. 2936 const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w; 2937 if (true ) { maybe_recycle_register(x); } 2938 if (y != x ) { maybe_recycle_register(y); } 2939 if (z != x && z != y ) { maybe_recycle_register(z); } 2940 if (w != x && w != y && w != z) { maybe_recycle_register(w); } 2941 2942 // Instructions that die at themselves (stores) don't need a register. 2943 if (inst.death != id) { 2944 // Allocate a register if we have to, preferring to reuse anything available. 2945 if (avail.empty()) { 2946 reg[id] = fImpl->regs++; 2947 } else { 2948 reg[id] = avail.back(); 2949 avail.pop_back(); 2950 } 2951 } 2952 }; 2953 2954 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. 2955 for (Val id = 0; id < (Val)instructions.size(); id++) { 2956 if ( instructions[id].can_hoist) { assign_register(id); } 2957 } 2958 for (Val id = 0; id < (Val)instructions.size(); id++) { 2959 if (!instructions[id].can_hoist) { assign_register(id); } 2960 } 2961 2962 // Translate OptimizedInstructions to InterpreterIstructions by mapping values to 2963 // registers. This will be two passes, first hoisted instructions, then inside the loop. 2964 2965 // The loop begins at the fImpl->loop'th Instruction. 2966 fImpl->loop = 0; 2967 fImpl->instructions.reserve(instructions.size()); 2968 2969 // Add a dummy mapping for the N/A sentinel Val to any arbitrary register 2970 // so lookups don't have to know which arguments are used by which Ops. 2971 auto lookup_register = [&](Val id) { 2972 return id == NA ? (Reg)0 2973 : reg[id]; 2974 }; 2975 2976 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { 2977 InterpreterInstruction pinst{ 2978 inst.op, 2979 lookup_register(id), 2980 lookup_register(inst.x), 2981 lookup_register(inst.y), 2982 lookup_register(inst.z), 2983 lookup_register(inst.w), 2984 inst.immA, 2985 inst.immB, 2986 }; 2987 fImpl->instructions.push_back(pinst); 2988 }; 2989 2990 for (Val id = 0; id < (Val)instructions.size(); id++) { 2991 const OptimizedInstruction& inst = instructions[id]; 2992 if (inst.can_hoist) { 2993 push_instruction(id, inst); 2994 fImpl->loop++; 2995 } 2996 } 2997 for (Val id = 0; id < (Val)instructions.size(); id++) { 2998 const OptimizedInstruction& inst = instructions[id]; 2999 if (!inst.can_hoist) { 3000 push_instruction(id, inst); 3001 } 3002 } 3003 } 3004 3005 #if defined(SKVM_JIT) 3006 jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const3007 bool Program::jit(const std::vector<OptimizedInstruction>& instructions, 3008 int* stack_hint, 3009 uint32_t* registers_used, 3010 Assembler* a) const { 3011 using A = Assembler; 3012 3013 SkTHashMap<int, A::Label> constants; // Constants (mostly splats) share the same pool. 3014 A::Label iota; // Varies per lane, for Op::index. 3015 A::Label load64_index; // Used to load low or high half of 64-bit lanes. 3016 3017 // The `regs` array tracks everything we know about each register's state: 3018 // - NA: empty 3019 // - RES: reserved by ABI 3020 // - TMP: holding a temporary 3021 // - id: holding Val id 3022 constexpr Val RES = NA-1, 3023 TMP = RES-1; 3024 3025 // Map val -> stack slot. 3026 std::vector<int> stack_slot(instructions.size(), NA); 3027 int next_stack_slot = 0; 3028 3029 const int nstack_slots = *stack_hint >= 0 ? *stack_hint 3030 : stack_slot.size(); 3031 3032 #if defined(__x86_64__) || defined(_M_X64) 3033 if (!SkCpu::Supports(SkCpu::HSW)) { 3034 return false; 3035 } 3036 const int K = 8; 3037 using Reg = A::Ymm; 3038 #if defined(_M_X64) // Important to check this first; clang-cl defines both. 3039 const A::GP64 N = A::rcx, 3040 GP0 = A::rax, 3041 GP1 = A::r11, 3042 arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi }; 3043 3044 // xmm6-15 need are callee-saved. 3045 std::array<Val,16> regs = { 3046 NA, NA, NA, NA, NA, NA,RES,RES, 3047 RES,RES,RES,RES, RES,RES,RES,RES, 3048 }; 3049 const uint32_t incoming_registers_used = *registers_used; 3050 3051 auto enter = [&]{ 3052 // rcx,rdx,r8,r9 are all already holding their correct values. 3053 // Load caller-saved r10 from rsp+40 if there's a fourth arg. 3054 if (fImpl->strides.size() >= 4) { 3055 a->mov(A::r10, A::Mem{A::rsp, 40}); 3056 } 3057 // Load callee-saved rdi from rsp+48 if there's a fifth arg, 3058 // first saving it to ABI reserved shadow area rsp+8. 3059 if (fImpl->strides.size() >= 5) { 3060 a->mov(A::Mem{A::rsp, 8}, A::rdi); 3061 a->mov(A::rdi, A::Mem{A::rsp, 48}); 3062 } 3063 // Load callee-saved rsi from rsp+56 if there's a sixth arg, 3064 // first saving it to ABI reserved shadow area rsp+16. 3065 if (fImpl->strides.size() >= 6) { 3066 a->mov(A::Mem{A::rsp, 16}, A::rsi); 3067 a->mov(A::rsi, A::Mem{A::rsp, 56}); 3068 } 3069 3070 // Allocate stack for our values and callee-saved xmm6-15. 3071 int stack_needed = nstack_slots*K*4; 3072 for (int r = 6; r < 16; r++) { 3073 if (incoming_registers_used & (1<<r)) { 3074 stack_needed += 16; 3075 } 3076 } 3077 if (stack_needed) { a->sub(A::rsp, stack_needed); } 3078 3079 int next_saved_xmm = nstack_slots*K*4; 3080 for (int r = 6; r < 16; r++) { 3081 if (incoming_registers_used & (1<<r)) { 3082 a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r); 3083 next_saved_xmm += 16; 3084 regs[r] = NA; 3085 } 3086 } 3087 }; 3088 auto exit = [&]{ 3089 // The second pass of jit() shouldn't use any register it didn't in the first pass. 3090 SkASSERT((*registers_used & incoming_registers_used) == *registers_used); 3091 3092 // Restore callee-saved xmm6-15 and the stack pointer. 3093 int stack_used = nstack_slots*K*4; 3094 for (int r = 6; r < 16; r++) { 3095 if (incoming_registers_used & (1<<r)) { 3096 a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used}); 3097 stack_used += 16; 3098 } 3099 } 3100 if (stack_used) { a->add(A::rsp, stack_used); } 3101 3102 // Restore callee-saved rdi/rsi if we used them. 3103 if (fImpl->strides.size() >= 5) { 3104 a->mov(A::rdi, A::Mem{A::rsp, 8}); 3105 } 3106 if (fImpl->strides.size() >= 6) { 3107 a->mov(A::rsi, A::Mem{A::rsp, 16}); 3108 } 3109 3110 a->vzeroupper(); 3111 a->ret(); 3112 }; 3113 #elif defined(__x86_64__) 3114 const A::GP64 N = A::rdi, 3115 GP0 = A::rax, 3116 GP1 = A::r11, 3117 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 }; 3118 3119 // All 16 ymm registers are available to use. 3120 std::array<Val,16> regs = { 3121 NA,NA,NA,NA, NA,NA,NA,NA, 3122 NA,NA,NA,NA, NA,NA,NA,NA, 3123 }; 3124 3125 auto enter = [&]{ 3126 // Load caller-saved r10 from rsp+8 if there's a sixth arg. 3127 if (fImpl->strides.size() >= 6) { 3128 a->mov(A::r10, A::Mem{A::rsp, 8}); 3129 } 3130 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } 3131 }; 3132 auto exit = [&]{ 3133 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); } 3134 a->vzeroupper(); 3135 a->ret(); 3136 }; 3137 #endif 3138 3139 auto load_from_memory = [&](Reg r, Val v) { 3140 if (instructions[v].op == Op::splat) { 3141 if (instructions[v].immA == 0) { 3142 a->vpxor(r,r,r); 3143 } else { 3144 a->vmovups(r, constants.find(instructions[v].immA)); 3145 } 3146 } else { 3147 SkASSERT(stack_slot[v] != NA); 3148 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4}); 3149 } 3150 }; 3151 auto store_to_stack = [&](Reg r, Val v) { 3152 SkASSERT(next_stack_slot < nstack_slots); 3153 stack_slot[v] = next_stack_slot++; 3154 a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r); 3155 }; 3156 #elif defined(__aarch64__) 3157 const int K = 4; 3158 using Reg = A::V; 3159 const A::X N = A::x0, 3160 GP0 = A::x8, 3161 GP1 = A::x9, 3162 arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; 3163 3164 // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit. 3165 std::array<Val,32> regs = { 3166 NA, NA, NA, NA, NA, NA, NA, NA, 3167 RES,RES,RES,RES, RES,RES,RES,RES, 3168 NA, NA, NA, NA, NA, NA, NA, NA, 3169 NA, NA, NA, NA, NA, NA, NA, NA, 3170 }; 3171 3172 auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } }; 3173 auto exit = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); } 3174 a->ret(A::x30); }; 3175 3176 auto load_from_memory = [&](Reg r, Val v) { 3177 if (instructions[v].op == Op::splat) { 3178 if (instructions[v].immA == 0) { 3179 a->eor16b(r,r,r); 3180 } else { 3181 a->ldrq(r, constants.find(instructions[v].immA)); 3182 } 3183 } else { 3184 SkASSERT(stack_slot[v] != NA); 3185 a->ldrq(r, A::sp, stack_slot[v]); 3186 } 3187 }; 3188 auto store_to_stack = [&](Reg r, Val v) { 3189 SkASSERT(next_stack_slot < nstack_slots); 3190 stack_slot[v] = next_stack_slot++; 3191 a->strq(r, A::sp, stack_slot[v]); 3192 }; 3193 #endif 3194 3195 *registers_used = 0; // We'll update this as we go. 3196 3197 if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) { 3198 return false; 3199 } 3200 3201 auto emit = [&](Val id, bool scalar) { 3202 const int active_lanes = scalar ? 1 : K; 3203 const OptimizedInstruction& inst = instructions[id]; 3204 const Op op = inst.op; 3205 const Val x = inst.x, 3206 y = inst.y, 3207 z = inst.z, 3208 w = inst.w; 3209 const int immA = inst.immA, 3210 immB = inst.immB; 3211 3212 // alloc_tmp() returns the first of N adjacent temporary registers, 3213 // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst(). 3214 auto alloc_tmp = [&](int N=1) -> Reg { 3215 auto needs_spill = [&](Val v) -> bool { 3216 SkASSERT(v >= 0); // {NA,TMP,RES} need to be handled before calling this. 3217 return stack_slot[v] == NA // We haven't spilled it already? 3218 && instructions[v].op != Op::splat; // No need to spill constants. 3219 }; 3220 3221 // We want to find a block of N adjacent registers requiring the fewest spills. 3222 int best_block = -1, 3223 min_spills = 0x7fff'ffff; 3224 for (int block = 0; block+N <= (int)regs.size(); block++) { 3225 int spills = 0; 3226 for (int r = block; r < block+N; r++) { 3227 Val v = regs[r]; 3228 // Registers holding NA (nothing) are ideal, nothing to spill. 3229 if (v == NA) { 3230 continue; 3231 } 3232 // We can't spill anything REServed or that we'll need this instruction. 3233 if (v == RES || 3234 v == TMP || v == id || v == x || v == y || v == z || v == w) { 3235 spills = 0x7fff'ffff; 3236 block = r; // (optimization) continue outer loop at next register. 3237 break; 3238 } 3239 // Usually here we've got a value v that we'd have to spill to the stack 3240 // before reusing its register, but sometimes even now we get a freebie. 3241 spills += needs_spill(v) ? 1 : 0; 3242 } 3243 3244 // TODO: non-arbitrary tie-breaking? 3245 if (min_spills > spills) { 3246 min_spills = spills; 3247 best_block = block; 3248 } 3249 if (min_spills == 0) { 3250 break; // (optimization) stop early if we find an unbeatable block. 3251 } 3252 } 3253 3254 // TODO: our search's success isn't obviously guaranteed... it depends on N 3255 // and the number and relative position in regs of any unspillable values. 3256 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64; 3257 // we'll need to revisit this logic should this assert fire. 3258 SkASSERT(min_spills <= N); 3259 3260 // Spill what needs spilling, and mark the block all as TMP. 3261 for (int r = best_block; r < best_block+N; r++) { 3262 Val& v = regs[r]; 3263 *registers_used |= (1<<r); 3264 3265 SkASSERT(v == NA || v >= 0); 3266 if (v >= 0 && needs_spill(v)) { 3267 store_to_stack((Reg)r, v); 3268 SkASSERT(!needs_spill(v)); 3269 min_spills--; 3270 } 3271 3272 v = TMP; 3273 } 3274 SkASSERT(min_spills == 0); 3275 return (Reg)best_block; 3276 }; 3277 3278 auto free_tmp = [&](Reg r) { 3279 SkASSERT(regs[r] == TMP); 3280 regs[r] = NA; 3281 }; 3282 3283 // Which register holds dst,x,y,z,w for this instruction? NA if none does yet. 3284 int rd = NA, 3285 rx = NA, 3286 ry = NA, 3287 rz = NA, 3288 rw = NA; 3289 3290 auto update_regs = [&](Reg r, Val v) { 3291 if (v == id) { rd = r; } 3292 if (v == x) { rx = r; } 3293 if (v == y) { ry = r; } 3294 if (v == z) { rz = r; } 3295 if (v == w) { rw = r; } 3296 return r; 3297 }; 3298 3299 auto find_existing_reg = [&](Val v) -> int { 3300 // Quick-check our working registers. 3301 if (v == id && rd != NA) { return rd; } 3302 if (v == x && rx != NA) { return rx; } 3303 if (v == y && ry != NA) { return ry; } 3304 if (v == z && rz != NA) { return rz; } 3305 if (v == w && rw != NA) { return rw; } 3306 3307 // Search inter-instruction register map. 3308 for (auto [r,val] : SkMakeEnumerate(regs)) { 3309 if (val == v) { 3310 return update_regs((Reg)r, v); 3311 } 3312 } 3313 return NA; 3314 }; 3315 3316 // Return a register for Val, holding that value if it already exists. 3317 // During this instruction all calls to r(v) will return the same register. 3318 auto r = [&](Val v) -> Reg { 3319 SkASSERT(v >= 0); 3320 3321 if (int found = find_existing_reg(v); found != NA) { 3322 return (Reg)found; 3323 } 3324 3325 Reg r = alloc_tmp(); 3326 SkASSERT(regs[r] == TMP); 3327 3328 SkASSERT(v <= id); 3329 if (v < id) { 3330 // If v < id, we're loading one of this instruction's inputs. 3331 // If v == id we're just allocating its destination register. 3332 load_from_memory(r, v); 3333 } 3334 regs[r] = v; 3335 return update_regs(r, v); 3336 }; 3337 3338 auto dies_here = [&](Val v) -> bool { 3339 SkASSERT(v >= 0); 3340 return instructions[v].death == id; 3341 }; 3342 3343 // Alias dst() to r(v) if dies_here(v). 3344 auto try_alias = [&](Val v) -> bool { 3345 SkASSERT(v == x || v == y || v == z || v == w); 3346 if (dies_here(v)) { 3347 rd = r(v); // Vals v and id share a register for this instruction. 3348 regs[rd] = id; // Next instruction, Val id will be in the register, not Val v. 3349 return true; 3350 } 3351 return false; 3352 }; 3353 3354 // Generally r(id), 3355 // but with a hint, try to alias dst() to r(v) if dies_here(v). 3356 auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg { 3357 if (hint1 != NA && try_alias(hint1)) { return r(id); } 3358 if (hint2 != NA && try_alias(hint2)) { return r(id); } 3359 return r(id); 3360 }; 3361 3362 #if defined(__aarch64__) // Nothing sneaky, just unused on x86-64. 3363 auto mark_tmp_as_dst = [&](Reg tmp) { 3364 SkASSERT(regs[tmp] == TMP); 3365 rd = tmp; 3366 regs[rd] = id; 3367 SkASSERT(dst() == tmp); 3368 }; 3369 #endif 3370 3371 #if defined(__x86_64__) || defined(_M_X64) 3372 // On x86 we can work with many values directly from the stack or program constant pool. 3373 auto any = [&](Val v) -> A::Operand { 3374 SkASSERT(v >= 0); 3375 SkASSERT(v < id); 3376 3377 if (int found = find_existing_reg(v); found != NA) { 3378 return (Reg)found; 3379 } 3380 if (instructions[v].op == Op::splat) { 3381 return constants.find(instructions[v].immA); 3382 } 3383 return A::Mem{A::rsp, stack_slot[v]*K*4}; 3384 }; 3385 3386 // This is never really worth asking except when any() might be used; 3387 // if we need this value in ARM, might as well just call r(v) to get it into a register. 3388 auto in_reg = [&](Val v) -> bool { 3389 return find_existing_reg(v) != NA; 3390 }; 3391 #endif 3392 3393 switch (op) { 3394 // Make sure splat constants can be found by load_from_memory() or any(). 3395 case Op::splat: 3396 (void)constants[immA]; 3397 break; 3398 3399 #if defined(__x86_64__) || defined(_M_X64) 3400 case Op::assert_true: { 3401 a->vptest (r(x), &constants[0xffffffff]); 3402 A::Label all_true; 3403 a->jc(&all_true); 3404 a->int3(); 3405 a->label(&all_true); 3406 } break; 3407 3408 case Op::store8: 3409 if (scalar) { 3410 a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3411 } else { 3412 a->vpackusdw(dst(x), r(x), r(x)); 3413 a->vpermq (dst(), dst(), 0xd8); 3414 a->vpackuswb(dst(), dst(), dst()); 3415 a->vmovq (A::Mem{arg[immA]}, (A::Xmm)dst()); 3416 } break; 3417 3418 case Op::store16: 3419 if (scalar) { 3420 a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3421 } else { 3422 a->vpackusdw(dst(x), r(x), r(x)); 3423 a->vpermq (dst(), dst(), 0xd8); 3424 a->vmovups (A::Mem{arg[immA]}, (A::Xmm)dst()); 3425 } break; 3426 3427 case Op::store32: if (scalar) { a->vmovd (A::Mem{arg[immA]}, (A::Xmm)r(x)); } 3428 else { a->vmovups(A::Mem{arg[immA]}, r(x)); } 3429 break; 3430 3431 case Op::store64: if (scalar) { 3432 a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x)); 3433 a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y)); 3434 } else { 3435 // r(x) = {a,b,c,d|e,f,g,h} 3436 // r(y) = {i,j,k,l|m,n,o,p} 3437 // We want to write a,i,b,j,c,k,d,l,e,m... 3438 A::Ymm L = alloc_tmp(), 3439 H = alloc_tmp(); 3440 a->vpunpckldq(L, r(x), any(y)); // L = {a,i,b,j|e,m,f,n} 3441 a->vpunpckhdq(H, r(x), any(y)); // H = {c,k,d,l|g,o,h,p} 3442 a->vperm2f128(dst(), L,H, 0x20); // = {a,i,b,j|c,k,d,l} 3443 a->vmovups(A::Mem{arg[immA], 0}, dst()); 3444 a->vperm2f128(dst(), L,H, 0x31); // = {e,m,f,n|g,o,h,p} 3445 a->vmovups(A::Mem{arg[immA],32}, dst()); 3446 free_tmp(L); 3447 free_tmp(H); 3448 } break; 3449 3450 case Op::store128: { 3451 // TODO: >32-bit stores 3452 a->vmovd (A::Mem{arg[immA], 0*16 + 0}, (A::Xmm)r(x) ); 3453 a->vmovd (A::Mem{arg[immA], 0*16 + 4}, (A::Xmm)r(y) ); 3454 a->vmovd (A::Mem{arg[immA], 0*16 + 8}, (A::Xmm)r(z) ); 3455 a->vmovd (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w) ); 3456 if (scalar) { break; } 3457 3458 a->vpextrd(A::Mem{arg[immA], 1*16 + 0}, (A::Xmm)r(x), 1); 3459 a->vpextrd(A::Mem{arg[immA], 1*16 + 4}, (A::Xmm)r(y), 1); 3460 a->vpextrd(A::Mem{arg[immA], 1*16 + 8}, (A::Xmm)r(z), 1); 3461 a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1); 3462 3463 a->vpextrd(A::Mem{arg[immA], 2*16 + 0}, (A::Xmm)r(x), 2); 3464 a->vpextrd(A::Mem{arg[immA], 2*16 + 4}, (A::Xmm)r(y), 2); 3465 a->vpextrd(A::Mem{arg[immA], 2*16 + 8}, (A::Xmm)r(z), 2); 3466 a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2); 3467 3468 a->vpextrd(A::Mem{arg[immA], 3*16 + 0}, (A::Xmm)r(x), 3); 3469 a->vpextrd(A::Mem{arg[immA], 3*16 + 4}, (A::Xmm)r(y), 3); 3470 a->vpextrd(A::Mem{arg[immA], 3*16 + 8}, (A::Xmm)r(z), 3); 3471 a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3); 3472 // Now we need to store the upper 128 bits of x,y,z,w. 3473 // Storing in this order rather than interlacing minimizes temporaries. 3474 a->vextracti128(dst(), r(x), 1); 3475 a->vmovd (A::Mem{arg[immA], 4*16 + 0}, (A::Xmm)dst() ); 3476 a->vpextrd(A::Mem{arg[immA], 5*16 + 0}, (A::Xmm)dst(), 1); 3477 a->vpextrd(A::Mem{arg[immA], 6*16 + 0}, (A::Xmm)dst(), 2); 3478 a->vpextrd(A::Mem{arg[immA], 7*16 + 0}, (A::Xmm)dst(), 3); 3479 3480 a->vextracti128(dst(), r(y), 1); 3481 a->vmovd (A::Mem{arg[immA], 4*16 + 4}, (A::Xmm)dst() ); 3482 a->vpextrd(A::Mem{arg[immA], 5*16 + 4}, (A::Xmm)dst(), 1); 3483 a->vpextrd(A::Mem{arg[immA], 6*16 + 4}, (A::Xmm)dst(), 2); 3484 a->vpextrd(A::Mem{arg[immA], 7*16 + 4}, (A::Xmm)dst(), 3); 3485 3486 a->vextracti128(dst(), r(z), 1); 3487 a->vmovd (A::Mem{arg[immA], 4*16 + 8}, (A::Xmm)dst() ); 3488 a->vpextrd(A::Mem{arg[immA], 5*16 + 8}, (A::Xmm)dst(), 1); 3489 a->vpextrd(A::Mem{arg[immA], 6*16 + 8}, (A::Xmm)dst(), 2); 3490 a->vpextrd(A::Mem{arg[immA], 7*16 + 8}, (A::Xmm)dst(), 3); 3491 3492 a->vextracti128(dst(), r(w), 1); 3493 a->vmovd (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst() ); 3494 a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1); 3495 a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2); 3496 a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3); 3497 } break; 3498 3499 case Op::load8: if (scalar) { 3500 a->vpxor (dst(), dst(), dst()); 3501 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3502 } else { 3503 a->vpmovzxbd(dst(), A::Mem{arg[immA]}); 3504 } break; 3505 3506 case Op::load16: if (scalar) { 3507 a->vpxor (dst(), dst(), dst()); 3508 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3509 } else { 3510 a->vpmovzxwd(dst(), A::Mem{arg[immA]}); 3511 } break; 3512 3513 case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), A::Mem{arg[immA]}); } 3514 else { a->vmovups( dst(), A::Mem{arg[immA]}); } 3515 break; 3516 3517 case Op::load64: if (scalar) { 3518 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3519 } else { 3520 A::Ymm tmp = alloc_tmp(); 3521 a->vmovups(tmp, &load64_index); 3522 a->vpermps(dst(), tmp, A::Mem{arg[immA], 0}); 3523 a->vpermps( tmp, tmp, A::Mem{arg[immA], 32}); 3524 // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1. 3525 a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20); 3526 free_tmp(tmp); 3527 } break; 3528 3529 case Op::load128: if (scalar) { 3530 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3531 } else { 3532 // Load 4 low values into xmm tmp, 3533 A::Ymm tmp = alloc_tmp(); 3534 A::Xmm t = (A::Xmm)tmp; 3535 a->vmovd (t, A::Mem{arg[immA], 0*16 + 4*immB} ); 3536 a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1); 3537 a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2); 3538 a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3); 3539 3540 // Load 4 high values into xmm dst(), 3541 A::Xmm d = (A::Xmm)dst(); 3542 a->vmovd (d, A::Mem{arg[immA], 4*16 + 4*immB} ); 3543 a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1); 3544 a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2); 3545 a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3); 3546 3547 // Merge the two, ymm dst() = {xmm tmp|xmm dst()} 3548 a->vperm2f128(dst(), tmp,dst(), 0x20); 3549 free_tmp(tmp); 3550 } break; 3551 3552 case Op::gather8: { 3553 // As usual, the gather base pointer is immB bytes off of uniform immA. 3554 a->mov(GP0, A::Mem{arg[immA], immB}); 3555 3556 A::Ymm tmp = alloc_tmp(); 3557 a->vmovups(tmp, any(x)); 3558 3559 for (int i = 0; i < active_lanes; i++) { 3560 if (i == 4) { 3561 // vpextrd can only pluck indices out from an Xmm register, 3562 // so we manually swap over to the top when we're halfway through. 3563 a->vextracti128((A::Xmm)tmp, tmp, 1); 3564 } 3565 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3566 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i); 3567 } 3568 a->vpmovzxbd(dst(), dst()); 3569 free_tmp(tmp); 3570 } break; 3571 3572 case Op::gather16: { 3573 // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd. 3574 a->mov(GP0, A::Mem{arg[immA], immB}); 3575 3576 A::Ymm tmp = alloc_tmp(); 3577 a->vmovups(tmp, any(x)); 3578 3579 for (int i = 0; i < active_lanes; i++) { 3580 if (i == 4) { 3581 a->vextracti128((A::Xmm)tmp, tmp, 1); 3582 } 3583 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3584 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i); 3585 } 3586 a->vpmovzxwd(dst(), dst()); 3587 free_tmp(tmp); 3588 } break; 3589 3590 case Op::gather32: 3591 if (scalar) { 3592 // Our gather base pointer is immB bytes off of uniform immA. 3593 a->mov(GP0, A::Mem{arg[immA], immB}); 3594 3595 // Grab our index from lane 0 of the index argument. 3596 a->vmovd(GP1, (A::Xmm)r(x)); 3597 3598 // dst = *(base + 4*index) 3599 a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR}); 3600 } else { 3601 a->mov(GP0, A::Mem{arg[immA], immB}); 3602 3603 A::Ymm mask = alloc_tmp(); 3604 a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) 3605 3606 a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask); 3607 free_tmp(mask); 3608 } 3609 break; 3610 3611 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB}); 3612 break; 3613 3614 case Op::index: a->vmovd((A::Xmm)dst(), N); 3615 a->vbroadcastss(dst(), dst()); 3616 a->vpsubd(dst(), dst(), &iota); 3617 break; 3618 3619 // We can swap the arguments of symmetric instructions to make better use of any(). 3620 case Op::add_f32: 3621 if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); } 3622 else { a->vaddps(dst(y), r(y), any(x)); } 3623 break; 3624 3625 case Op::mul_f32: 3626 if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); } 3627 else { a->vmulps(dst(y), r(y), any(x)); } 3628 break; 3629 3630 case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break; 3631 case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break; 3632 case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters, 3633 case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max. 3634 3635 case Op::fma_f32: 3636 if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else 3637 if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else 3638 if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else 3639 { a->vmovups (dst(), any(x)); 3640 a->vfmadd132ps(dst(), r(z), any(y)); } 3641 break; 3642 3643 case Op::fms_f32: 3644 if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else 3645 if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else 3646 if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else 3647 { a->vmovups (dst(), any(x)); 3648 a->vfmsub132ps(dst(), r(z), any(y)); } 3649 break; 3650 3651 case Op::fnma_f32: 3652 if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else 3653 if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else 3654 if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else 3655 { a->vmovups (dst(), any(x)); 3656 a->vfnmadd132ps(dst(), r(z), any(y)); } 3657 break; 3658 3659 // In situations like this we want to try aliasing dst(x) when x is 3660 // already in a register, but not if we'd have to load it from the stack 3661 // just to alias it. That's done better directly into the new register. 3662 case Op::sqrt_f32: 3663 if (in_reg(x)) { a->vsqrtps(dst(x), r(x)); } 3664 else { a->vsqrtps(dst(), any(x)); } 3665 break; 3666 3667 case Op::add_i32: 3668 if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); } 3669 else { a->vpaddd(dst(y), r(y), any(x)); } 3670 break; 3671 3672 case Op::mul_i32: 3673 if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); } 3674 else { a->vpmulld(dst(y), r(y), any(x)); } 3675 break; 3676 3677 case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break; 3678 3679 case Op::bit_and: 3680 if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); } 3681 else { a->vpand(dst(y), r(y), any(x)); } 3682 break; 3683 case Op::bit_or: 3684 if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); } 3685 else { a->vpor(dst(y), r(y), any(x)); } 3686 break; 3687 case Op::bit_xor: 3688 if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); } 3689 else { a->vpxor(dst(y), r(y), any(x)); } 3690 break; 3691 3692 case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x. 3693 3694 case Op::select: 3695 if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); } 3696 else { a->vpblendvb(dst(x), r(z), any(y), r(x)); } 3697 break; 3698 3699 case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break; 3700 case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break; 3701 case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break; 3702 3703 case Op::eq_i32: 3704 if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); } 3705 else { a->vpcmpeqd(dst(y), r(y), any(x)); } 3706 break; 3707 3708 case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break; 3709 3710 case Op::eq_f32: 3711 if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); } 3712 else { a->vcmpeqps(dst(y), r(y), any(x)); } 3713 break; 3714 case Op::neq_f32: 3715 if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); } 3716 else { a->vcmpneqps(dst(y), r(y), any(x)); } 3717 break; 3718 3719 case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break; 3720 case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break; 3721 3722 case Op::ceil: 3723 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); } 3724 else { a->vroundps(dst(), any(x), Assembler::CEIL); } 3725 break; 3726 3727 case Op::floor: 3728 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::FLOOR); } 3729 else { a->vroundps(dst(), any(x), Assembler::FLOOR); } 3730 break; 3731 3732 case Op::to_f32: 3733 if (in_reg(x)) { a->vcvtdq2ps(dst(x), r(x)); } 3734 else { a->vcvtdq2ps(dst(), any(x)); } 3735 break; 3736 3737 case Op::trunc: 3738 if (in_reg(x)) { a->vcvttps2dq(dst(x), r(x)); } 3739 else { a->vcvttps2dq(dst(), any(x)); } 3740 break; 3741 3742 case Op::round: 3743 if (in_reg(x)) { a->vcvtps2dq(dst(x), r(x)); } 3744 else { a->vcvtps2dq(dst(), any(x)); } 3745 break; 3746 3747 case Op::to_fp16: 3748 a->vcvtps2ph(dst(x), r(x), A::CURRENT); // f32 ymm -> f16 xmm 3749 a->vpmovzxwd(dst(), dst()); // f16 xmm -> f16 ymm 3750 break; 3751 3752 case Op::from_fp16: 3753 a->vpackusdw(dst(x), r(x), r(x)); // f16 ymm -> f16 xmm 3754 a->vpermq (dst(), dst(), 0xd8); // swap middle two 64-bit lanes 3755 a->vcvtph2ps(dst(), dst()); // f16 xmm -> f32 ymm 3756 break; 3757 3758 #elif defined(__aarch64__) 3759 case Op::assert_true: { 3760 a->uminv4s(dst(), r(x)); // uminv acts like an all() across the vector. 3761 a->movs(GP0, dst(), 0); 3762 A::Label all_true; 3763 a->cbnz(GP0, &all_true); 3764 a->brk(0); 3765 a->label(&all_true); 3766 } break; 3767 3768 case Op::index: { 3769 A::V tmp = alloc_tmp(); 3770 a->ldrq (tmp, &iota); 3771 a->dup4s(dst(), N); 3772 a->sub4s(dst(), dst(), tmp); 3773 free_tmp(tmp); 3774 } break; 3775 3776 case Op::store8: a->xtns2h(dst(x), r(x)); 3777 a->xtnh2b(dst(), dst()); 3778 if (scalar) { a->strb (dst(), arg[immA]); } 3779 else { a->strs (dst(), arg[immA]); } 3780 break; 3781 3782 case Op::store16: a->xtns2h(dst(x), r(x)); 3783 if (scalar) { a->strh (dst(), arg[immA]); } 3784 else { a->strd (dst(), arg[immA]); } 3785 break; 3786 3787 case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); } 3788 else { a->strq(r(x), arg[immA]); } 3789 break; 3790 3791 case Op::store64: if (scalar) { 3792 a->strs(r(x), arg[immA], 0); 3793 a->strs(r(y), arg[immA], 1); 3794 } else if (r(y) == r(x)+1) { 3795 a->st24s(r(x), arg[immA]); 3796 } else { 3797 Reg tmp0 = alloc_tmp(2), 3798 tmp1 = (Reg)(tmp0+1); 3799 a->orr16b(tmp0, r(x), r(x)); 3800 a->orr16b(tmp1, r(y), r(y)); 3801 a-> st24s(tmp0, arg[immA]); 3802 free_tmp(tmp0); 3803 free_tmp(tmp1); 3804 } break; 3805 3806 case Op::store128: 3807 if (scalar) { 3808 a->strs(r(x), arg[immA], 0); 3809 a->strs(r(y), arg[immA], 1); 3810 a->strs(r(z), arg[immA], 2); 3811 a->strs(r(w), arg[immA], 3); 3812 } else if (r(y) == r(x)+1 && 3813 r(z) == r(x)+2 && 3814 r(w) == r(x)+3) { 3815 a->st44s(r(x), arg[immA]); 3816 } else { 3817 Reg tmp0 = alloc_tmp(4), 3818 tmp1 = (Reg)(tmp0+1), 3819 tmp2 = (Reg)(tmp0+2), 3820 tmp3 = (Reg)(tmp0+3); 3821 a->orr16b(tmp0, r(x), r(x)); 3822 a->orr16b(tmp1, r(y), r(y)); 3823 a->orr16b(tmp2, r(z), r(z)); 3824 a->orr16b(tmp3, r(w), r(w)); 3825 a-> st44s(tmp0, arg[immA]); 3826 free_tmp(tmp0); 3827 free_tmp(tmp1); 3828 free_tmp(tmp2); 3829 free_tmp(tmp3); 3830 } break; 3831 3832 3833 case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); } 3834 else { a->ldrs(dst(), arg[immA]); } 3835 a->uxtlb2h(dst(), dst()); 3836 a->uxtlh2s(dst(), dst()); 3837 break; 3838 3839 case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); } 3840 else { a->ldrd(dst(), arg[immA]); } 3841 a->uxtlh2s(dst(), dst()); 3842 break; 3843 3844 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); } 3845 else { a->ldrq(dst(), arg[immA]); } 3846 break; 3847 3848 case Op::load64: if (scalar) { 3849 a->ldrs(dst(), arg[immA], immB); 3850 } else { 3851 Reg tmp0 = alloc_tmp(2), 3852 tmp1 = (Reg)(tmp0+1); 3853 a->ld24s(tmp0, arg[immA]); 3854 // TODO: return both 3855 switch (immB) { 3856 case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break; 3857 case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break; 3858 } 3859 } break; 3860 3861 case Op::load128: if (scalar) { 3862 a->ldrs(dst(), arg[immA], immB); 3863 } else { 3864 Reg tmp0 = alloc_tmp(4), 3865 tmp1 = (Reg)(tmp0+1), 3866 tmp2 = (Reg)(tmp0+2), 3867 tmp3 = (Reg)(tmp0+3); 3868 a->ld44s(tmp0, arg[immA]); 3869 // TODO: return all four 3870 switch (immB) { 3871 case 0: mark_tmp_as_dst(tmp0); break; 3872 case 1: mark_tmp_as_dst(tmp1); break; 3873 case 2: mark_tmp_as_dst(tmp2); break; 3874 case 3: mark_tmp_as_dst(tmp3); break; 3875 } 3876 if (immB != 0) { free_tmp(tmp0); } 3877 if (immB != 1) { free_tmp(tmp1); } 3878 if (immB != 2) { free_tmp(tmp2); } 3879 if (immB != 3) { free_tmp(tmp3); } 3880 } break; 3881 3882 case Op::uniform32: a->add(GP0, arg[immA], immB); 3883 a->ld1r4s(dst(), GP0); 3884 break; 3885 3886 case Op::gather8: { 3887 // As usual, the gather base pointer is immB bytes off of uniform immA. 3888 a->add (GP0, arg[immA], immB); // GP0 = &(gather base pointer) 3889 a->ldrd(GP0, GP0); // GP0 = gather base pointer 3890 3891 for (int i = 0; i < active_lanes; i++) { 3892 a->movs(GP1, r(x), i); // Extract index lane i into GP1. 3893 a->add (GP1, GP0, GP1); // Add the gather base pointer. 3894 a->ldrb(GP1, GP1); // Load that byte. 3895 a->inss(dst(x), GP1, i); // Insert it into dst() lane i. 3896 } 3897 } break; 3898 3899 // See gather8 for general idea; comments here only where gather16 differs. 3900 case Op::gather16: { 3901 a->add (GP0, arg[immA], immB); 3902 a->ldrd(GP0, GP0); 3903 for (int i = 0; i < active_lanes; i++) { 3904 a->movs(GP1, r(x), i); 3905 a->add (GP1, GP0, GP1, A::LSL, 1); // Scale index 2x into a byte offset. 3906 a->ldrh(GP1, GP1); // 2-byte load. 3907 a->inss(dst(x), GP1, i); 3908 } 3909 } break; 3910 3911 // See gather8 for general idea; comments here only where gather32 differs. 3912 case Op::gather32: { 3913 a->add (GP0, arg[immA], immB); 3914 a->ldrd(GP0, GP0); 3915 for (int i = 0; i < active_lanes; i++) { 3916 a->movs(GP1, r(x), i); 3917 a->add (GP1, GP0, GP1, A::LSL, 2); // Scale index 4x into a byte offset. 3918 a->ldrs(GP1, GP1); // 4-byte load. 3919 a->inss(dst(x), GP1, i); 3920 } 3921 } break; 3922 3923 case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break; 3924 case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break; 3925 case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break; 3926 case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break; 3927 3928 case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break; 3929 3930 case Op::fma_f32: // fmla.4s is z += x*y 3931 if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); } 3932 else { a->orr16b(dst(), r(z), r(z)); 3933 a->fmla4s(dst(), r(x), r(y)); } 3934 break; 3935 3936 case Op::fnma_f32: // fmls.4s is z -= x*y 3937 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 3938 else { a->orr16b(dst(), r(z), r(z)); 3939 a->fmls4s(dst(), r(x), r(y)); } 3940 break; 3941 3942 case Op::fms_f32: // calculate z - xy, then negate to xy - z 3943 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 3944 else { a->orr16b(dst(), r(z), r(z)); 3945 a->fmls4s(dst(), r(x), r(y)); } 3946 a->fneg4s(dst(), dst()); 3947 break; 3948 3949 case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break; 3950 case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break; 3951 case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break; 3952 case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); 3953 a->not16b (dst(), dst()); break; 3954 3955 3956 case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break; 3957 case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break; 3958 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break; 3959 3960 case Op::bit_and : a->and16b(dst(x,y), r(x), r(y)); break; 3961 case Op::bit_or : a->orr16b(dst(x,y), r(x), r(y)); break; 3962 case Op::bit_xor : a->eor16b(dst(x,y), r(x), r(y)); break; 3963 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break; 3964 3965 case Op::select: // bsl16b is x = x ? y : z 3966 if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); } 3967 else { a->orr16b(dst(), r(x), r(x)); 3968 a->bsl16b(dst(), r(y), r(z)); } 3969 break; 3970 3971 // fmin4s and fmax4s don't work the way we want with NaN, 3972 // so we write them the long way: 3973 case Op::min_f32: // min(x,y) = y<x ? y : x 3974 a->fcmgt4s(dst(), r(x), r(y)); 3975 a->bsl16b (dst(), r(y), r(x)); 3976 break; 3977 3978 case Op::max_f32: // max(x,y) = x<y ? y : x 3979 a->fcmgt4s(dst(), r(y), r(x)); 3980 a->bsl16b (dst(), r(y), r(x)); 3981 break; 3982 3983 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break; 3984 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break; 3985 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break; 3986 3987 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break; 3988 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break; 3989 3990 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break; 3991 case Op::trunc: a->fcvtzs4s(dst(x), r(x)); break; 3992 case Op::round: a->fcvtns4s(dst(x), r(x)); break; 3993 case Op::ceil: a->frintp4s(dst(x), r(x)); break; 3994 case Op::floor: a->frintm4s(dst(x), r(x)); break; 3995 3996 case Op::to_fp16: 3997 a->fcvtn (dst(x), r(x)); // 4x f32 -> 4x f16 in bottom four lanes 3998 a->uxtlh2s(dst(), dst()); // expand to 4x f16 in even 16-bit lanes 3999 break; 4000 4001 case Op::from_fp16: 4002 a->xtns2h(dst(x), r(x)); // pack even 16-bit lanes into bottom four lanes 4003 a->fcvtl (dst(), dst()); // 4x f16 -> 4x f32 4004 break; 4005 #endif 4006 } 4007 4008 // Proactively free the registers holding any value that dies here. 4009 if (rd != NA && dies_here(regs[rd])) { regs[rd] = NA; } 4010 if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; } 4011 if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; } 4012 if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; } 4013 if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; } 4014 return true; 4015 }; 4016 4017 #if defined(__x86_64__) || defined(_M_X64) 4018 auto jump_if_less = [&](A::Label* l) { a->jl (l); }; 4019 auto jump = [&](A::Label* l) { a->jmp(l); }; 4020 4021 auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; 4022 auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; 4023 #elif defined(__aarch64__) 4024 auto jump_if_less = [&](A::Label* l) { a->blt(l); }; 4025 auto jump = [&](A::Label* l) { a->b (l); }; 4026 4027 auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; 4028 auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; 4029 #endif 4030 4031 A::Label body, 4032 tail, 4033 done; 4034 4035 enter(); 4036 for (Val id = 0; id < (Val)instructions.size(); id++) { 4037 if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 4038 return false; 4039 } 4040 } 4041 4042 // This point marks a kind of canonical fixed point for register contents: if loop 4043 // code is generated as if these registers are holding these values, the next time 4044 // the loop comes around we'd better find those same registers holding those same values. 4045 auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot, 4046 saved_next_stack_slot=next_stack_slot]{ 4047 for (int r = 0; r < (int)regs.size(); r++) { 4048 if (regs[r] != incoming[r]) { 4049 regs[r] = incoming[r]; 4050 if (regs[r] >= 0) { 4051 load_from_memory((Reg)r, regs[r]); 4052 } 4053 } 4054 } 4055 *stack_hint = std::max(*stack_hint, next_stack_slot); 4056 stack_slot = saved_stack_slot; 4057 next_stack_slot = saved_next_stack_slot; 4058 }; 4059 4060 a->label(&body); 4061 { 4062 a->cmp(N, K); 4063 jump_if_less(&tail); 4064 for (Val id = 0; id < (Val)instructions.size(); id++) { 4065 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 4066 return false; 4067 } 4068 } 4069 restore_incoming_regs(); 4070 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 4071 if (fImpl->strides[i]) { 4072 add(arg[i], K*fImpl->strides[i]); 4073 } 4074 } 4075 sub(N, K); 4076 jump(&body); 4077 } 4078 4079 a->label(&tail); 4080 { 4081 a->cmp(N, 1); 4082 jump_if_less(&done); 4083 for (Val id = 0; id < (Val)instructions.size(); id++) { 4084 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) { 4085 return false; 4086 } 4087 } 4088 restore_incoming_regs(); 4089 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 4090 if (fImpl->strides[i]) { 4091 add(arg[i], 1*fImpl->strides[i]); 4092 } 4093 } 4094 sub(N, 1); 4095 jump(&tail); 4096 } 4097 4098 a->label(&done); 4099 { 4100 exit(); 4101 } 4102 4103 // Except for explicit aligned load and store instructions, AVX allows 4104 // memory operands to be unaligned. So even though we're creating 16 4105 // byte patterns on ARM or 32-byte patterns on x86, we only need to 4106 // align to 4 bytes, the element size and alignment requirement. 4107 4108 constants.foreach([&](int imm, A::Label* label) { 4109 a->align(4); 4110 a->label(label); 4111 for (int i = 0; i < K; i++) { 4112 a->word(imm); 4113 } 4114 }); 4115 4116 if (!iota.references.empty()) { 4117 a->align(4); 4118 a->label(&iota); // 0,1,2,3,4,... 4119 for (int i = 0; i < K; i++) { 4120 a->word(i); 4121 } 4122 } 4123 4124 if (!load64_index.references.empty()) { 4125 a->align(4); 4126 a->label(&load64_index); // {0,2,4,6|1,3,5,7} 4127 a->word(0); a->word(2); a->word(4); a->word(6); 4128 a->word(1); a->word(3); a->word(5); a->word(7); 4129 } 4130 4131 return true; 4132 } 4133 setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4134 void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions, 4135 const char* debug_name) { 4136 // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble) 4137 // and stack_hint/registers_used to feed forward into the next jit() call. 4138 Assembler a{nullptr}; 4139 int stack_hint = -1; 4140 uint32_t registers_used = 0xffff'ffff; // Start conservatively with all. 4141 if (!this->jit(instructions, &stack_hint, ®isters_used, &a)) { 4142 return; 4143 } 4144 4145 fImpl->jit_size = a.size(); 4146 void* jit_entry = alloc_jit_buffer(&fImpl->jit_size); 4147 fImpl->jit_entry.store(jit_entry); 4148 4149 // Assemble the program for real with stack_hint/registers_used as feedback from first call. 4150 a = Assembler{jit_entry}; 4151 SkAssertResult(this->jit(instructions, &stack_hint, ®isters_used, &a)); 4152 SkASSERT(a.size() <= fImpl->jit_size); 4153 4154 // Remap as executable, and flush caches on platforms that need that. 4155 remap_as_executable(jit_entry, fImpl->jit_size); 4156 4157 notify_vtune(debug_name, jit_entry, fImpl->jit_size); 4158 4159 #if !defined(SK_BUILD_FOR_WIN) 4160 // For profiling and debugging, it's helpful to have this code loaded 4161 // dynamically rather than just jumping info fImpl->jit_entry. 4162 if (gSkVMJITViaDylib) { 4163 // Dump the raw program binary. 4164 SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name); 4165 int fd = mkstemp(path.writable_str()); 4166 ::write(fd, jit_entry, a.size()); 4167 close(fd); 4168 4169 this->dropJIT(); // (unmap and null out fImpl->jit_entry.) 4170 4171 // Convert it in-place to a dynamic library with a single symbol "skvm_jit": 4172 SkString cmd = SkStringPrintf( 4173 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" 4174 " | clang -x assembler -shared - -o %s", 4175 path.c_str(), path.c_str()); 4176 system(cmd.c_str()); 4177 4178 // Load that dynamic library and look up skvm_jit(). 4179 fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL); 4180 void* sym = nullptr; 4181 for (const char* name : {"skvm_jit", "_skvm_jit"} ) { 4182 if (!sym) { sym = dlsym(fImpl->dylib, name); } 4183 } 4184 fImpl->jit_entry.store(sym); 4185 } 4186 #endif 4187 } 4188 #endif 4189 4190 } // namespace skvm 4191