1 /* 2 * Copyright 2019 Google LLC 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "include/core/SkStream.h" 9 #include "include/core/SkString.h" 10 #include "include/private/SkHalf.h" 11 #include "include/private/SkTFitsIn.h" 12 #include "include/private/SkThreadID.h" 13 #include "src/core/SkColorSpacePriv.h" 14 #include "src/core/SkColorSpaceXformSteps.h" 15 #include "src/core/SkCpu.h" 16 #include "src/core/SkEnumerate.h" 17 #include "src/core/SkOpts.h" 18 #include "src/core/SkStreamPriv.h" 19 #include "src/core/SkVM.h" 20 #include "src/utils/SkVMVisualizer.h" 21 #include <algorithm> 22 #include <atomic> 23 #include <queue> 24 25 #if defined(SKVM_LLVM) 26 #include <future> 27 #include <llvm/Bitcode/BitcodeWriter.h> 28 #include <llvm/ExecutionEngine/ExecutionEngine.h> 29 #include <llvm/IR/IRBuilder.h> 30 #include <llvm/IR/Verifier.h> 31 #include <llvm/Support/TargetSelect.h> 32 #include <llvm/Support/Host.h> 33 34 // Platform-specific intrinsics got their own files in LLVM 10. 
#if __has_include(<llvm/IR/IntrinsicsX86.h>)
    #include <llvm/IR/IntrinsicsX86.h>
#endif
#endif

#if !defined(SK_BUILD_FOR_WIN)
    #include <unistd.h>
#endif

// #define SKVM_LLVM_WAIT_FOR_COMPILATION

bool gSkVMAllowJIT{false};
bool gSkVMJITViaDylib{false};

#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        // Reserve+commit a read/write buffer that will later hold JIT'd code.
        // VirtualAlloc already rounds the request up to page granularity for us.
        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }

        // Flip a filled JIT buffer from read/write to read/execute (W^X).
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }

        #if !defined(SKVM_LLVM)
        // Release a buffer from alloc_jit_buffer().  (len is unused on Windows;
        // VirtualFree with MEM_RELEASE frees the whole original reservation.)
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
        #endif
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>

        // POSIX flavor of the allocator above, built on mmap/mprotect.
        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }

        // Flip the buffer to read/execute, then flush the instruction cache so
        // freshly written code is visible to the CPU (required on e.g. ARM).
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }

        #if !defined(SKVM_LLVM)
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
        #endif
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        // Tell VTune about a freshly JIT'd function so it shows up in profiles
        // with a readable name instead of an anonymous address range.
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        // No-op stub when VTune JIT profiling support is not compiled in.
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif

// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we'll not see the writes it makes as properly
// initializing memory.  Instead force the interpreter, which should let
// MSAN see everything our programs do properly.
//
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #define SKVM_JIT_BUT_IGNORE_IT
    #endif
#endif

#if defined(SKSL_STANDALONE)
    // skslc needs to link against this module (for the VM code generator). This module pulls in
    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
    // never cares about color management.
    skvm::F32 sk_program_transfer_fn(
        skvm::F32 v, TFKind tf_kind,
        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
        return v;
    }

    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
#endif

namespace skvm {

    // Probe the host CPU once and cache which optional instruction families
    // (fused multiply-add, fp16) the JIT may rely on.
    static Features detect_features() {
        static const bool fma =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);   // Haswell+ implies FMA3 on x86.
        #elif defined(SK_CPU_ARM64)
            true;                          // FMA is baseline on ARM64.
        #else
            false;
        #endif

        static const bool fp16 = false;  // TODO

        return { fma, fp16 };
    }

    Builder::Builder(bool createDuplicates)
        : fFeatures(detect_features()), fCreateDuplicates(createDuplicates) {}
    Builder::Builder(Features features, bool createDuplicates)
        : fFeatures(features         ), fCreateDuplicates(createDuplicates) {}

    // Private implementation state for a finished Program: the interpreter
    // instruction stream plus any JIT artifacts produced for it.
    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;                       // number of interpreter registers needed
        int loop = 0;                       // first instruction of the per-lane loop body
        std::vector<int> strides;           // per-argument byte stride (0 == uniform)
        std::vector<TraceHook*> traceHooks;
        std::unique_ptr<viz::Visualizer> visualizer;

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;
        void*  dylib    = nullptr;

    #if defined(SKVM_LLVM)
        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
        std::future<void>                      llvm_compiling;
    #endif
    };

    // Debugging tools, mostly for printing various data structures out to a stream.

    namespace {
        // Thin wrapper types so the overloaded write() helpers below can pick
        // the right textual formatting for each kind of operand.
        struct V { Val id; };
        struct R { Reg id; };
        struct Shift       { int bits; };
        struct Splat       { int bits; };
        struct Hex         { int bits; };
        struct TraceHookID { int bits; };
        // For op `trace_line`
        struct Line  { int bits; };
        // For op `trace_var`
        struct VarSlot { int bits; };
        // For op `trace_enter`/`trace_exit`
        struct FnIdx { int bits; };

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        // Map an Op enumerator back to its source-level name via SKVM_OPS.
        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Ptr p) {
            write(o, "ptr");
            o->writeDecAsText(p.ix);
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
        // Splats print both the raw bit pattern and its float interpretation.
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        static void write(SkWStream* o, TraceHookID h) {
            o->writeDecAsText(h.bits);
        }
        static void write(SkWStream* o, Line d) {
            write(o, "L");
            o->writeDecAsText(d.bits);
        }
        static void write(SkWStream* o, VarSlot s) {
            write(o, "$");
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, FnIdx s) {
            write(o, "F");
            o->writeDecAsText(s.bits);
        }
        // Variadic base: write each operand separated by a single space.
        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace

    // Print one optimized (pre-register-allocation) instruction, operands as value IDs.
    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z,
             w = inst.w;
        int immA = inst.immA,
            immB = inst.immB,
            immC = inst.immC;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}); break;

            case Op::trace_line:  write(o, op, TraceHookID{immA}, V{x}, V{y}, Line{immB}); break;
            case Op::trace_var:   write(o, op, TraceHookID{immA}, V{x}, V{y},
                                        VarSlot{immB}, "=", V{z}); break;
            case Op::trace_enter: write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
            case Op::trace_exit:  write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
            case Op::trace_scope: write(o, op, TraceHookID{immA}, V{x}, V{y}, Shift{immB}); break;

            case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;

            case Op::index: write(o, V{id}, "=", op); break;

            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;

            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;

            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;

            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;


            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;

            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}); break;

            case Op::duplicate: write(o, V{id}, "=", op, Hex{immA}); break;
        }

        write(o, "\n");
    }

    // Dump the optimized program (defaults to SkDebugf when no stream is given).
    // Hoistable instructions are marked with a leading arrow.
    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            write(o, inst.can_hoist ? "↑ " : "  ");
            write_one_instruction(id, inst, o);
        }
    }

    void Program::visualize(SkWStream* output, const char* code) const {
        if (fImpl->visualizer) {
            fImpl->visualizer->dump(output, code);
        }
    }

    viz::Visualizer* Program::visualizer() { return fImpl->visualizer.get(); }

    // Dump the register-allocated interpreter program, operands as registers.
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op  op = inst.op;
            Reg  d = inst.d,
                 x = inst.x,
                 y = inst.y,
                 z = inst.z,
                 w = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::trace_line:  write(o, op, TraceHookID{immA},
                                            R{x}, R{y}, Line{immB}); break;
                case Op::trace_var:   write(o, op, TraceHookID{immA}, R{x}, R{y},
                                            VarSlot{immB}, "=", R{z}); break;
                case Op::trace_enter: write(o, op, TraceHookID{immA},
                                            R{x}, R{y}, FnIdx{immB}); break;
                case Op::trace_exit:  write(o, op, TraceHookID{immA},
                                            R{x}, R{y}, FnIdx{immB}); break;
                case Op::trace_scope: write(o, op, TraceHookID{immA},
                                            R{x}, R{y}, Shift{immB}); break;

                case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;

                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

                case Op::splat: write(o, R{d}, "=", op, Splat{immA}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;

                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;

                case Op::duplicate: write(o, R{d}, "=", op, Hex{immA}); break;
            }
            write(o, "\n");
        }
    }

    // Remove instructions whose results are never used by any side-effecting op,
    // compacting value IDs so the surviving program stays densely numbered.
    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program,
                                                 viz::Visualizer* visualizer) {
        // Determine which Instructions are live by working back from side effects.
        // Walk backwards so each instruction's liveness is settled before we
        // visit the instructions that feed it.
        std::vector<bool> live(program.size(), false);
        for (Val id = program.size(); id--;) {
            if (live[id] || has_side_effect(program[id].op)) {
                live[id] = true;
                const Instruction& inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { live[arg] = true; }
                }
            }
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        SkASSERT(*arg != NA);
                    }
                }
                new_id[id] = next++;
            }
        }

        if (visualizer) {
            visualizer->addInstructions(program);
            visualizer->markAsDeadCode(live, new_id);
        }

        // Eliminate any non-live ops.
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }

    // Convert Instructions to OptimizedInstructions, computing each value's
    // death point (last use) and whether it can be hoisted out of the loop.
    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program,
                                               viz::Visualizer* visualizer) {
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
                             inst.immA,inst.immB,inst.immC,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op) || is_trace(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        if (visualizer) {
            visualizer->finalize(program, optimized);
        }

        return optimized;
    }

    // Full optimization pipeline: dead-code elimination, then lifetime/hoist analysis.
    std::vector<OptimizedInstruction> Builder::optimize(viz::Visualizer* visualizer) const {
        std::vector<Instruction> program = this->program();
        program = eliminate_dead_code(std::move(program), visualizer);
        return    finalize           (std::move(program), visualizer);
    }

    Program Builder::done(const char* debug_name,
                          bool allow_jit) const {
        return this->done(debug_name, allow_jit, /*visualizer=*/nullptr);
    }

    // Finish building: optimize and package everything into a runnable Program.
    // When no debug_name is given, synthesize one from the program's hash.
    Program Builder::done(const char* debug_name,
                          bool allow_jit,
                          std::unique_ptr<viz::Visualizer> visualizer) const {
        char buf[64] = "skvm-jit-";
        if (!debug_name) {
            *SkStrAppendU32(buf+9, this->hash()) = '\0';
            debug_name = buf;
        }

        auto optimized = this->optimize(visualizer ? visualizer.get() : nullptr);
        return {optimized,
                std::move(visualizer),
                fStrides,
                fTraceHooks, debug_name, allow_jit};
    }

    // 64-bit content hash of the (unoptimized) program, built from two 32-bit
    // hashes of the same bytes with different seeds.
    uint64_t Builder::hash() const {
        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
        return (uint64_t)lo | (uint64_t)hi << 32;
    }

    bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }

    bool operator==(const Instruction& a, const Instruction& b) {
        return a.op   == b.op
            && a.x    == b.x
            && a.y    == b.y
            && a.z    == b.z
            && a.w    == b.w
            && a.immA == b.immA
            && a.immB == b.immB
            && a.immC == b.immC;
    }

    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }


    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        //   if we've already seen this exact Instruction, use it instead of creating a new one.
        //
        // But we never dedup loads or stores: an intervening store could change that memory.
        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
        // and index is varying but doesn't touch memory, so it's fine to dedup too.
        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
            if (Val* id = fIndex.find(inst)) {
                // Seen before.  Optionally record an explicit duplicate op for
                // debugging (fCreateDuplicates), but always return the old ID.
                if (fCreateDuplicates) {
                    inst.op = Op::duplicate;
                    inst.immA = *id;
                    fProgram.push_back(inst);
                }
                return *id;
            }
        }

        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }

    // Declare a new pointer argument with the given per-lane byte stride.
    Ptr Builder::arg(int stride) {
        int ix = (int)fStrides.size();
        fStrides.push_back(stride);
        return {ix};
    }

    // Debug-only runtime check; compiled out entirely in release builds.
    // A condition known true at build time is asserted immediately instead.
    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id, debug.id);
    #endif
    }

    // Register a trace hook; the returned ID is embedded in trace_* ops.
    int Builder::attachTraceHook(TraceHook* hook) {
        int traceHookID = (int)fTraceHooks.size();
        fTraceHooks.push_back(hook);
        return traceHookID;
    }

    // Combine execution mask and trace mask for a trace op.  Returns false when
    // either mask is known all-off (the trace op can be skipped entirely);
    // otherwise narrows an all-on mask to the other one.
    bool Builder::mergeMasks(I32& mask, I32& traceMask) {
        if (this->isImm(mask.id,      0)) { return false; }
        if (this->isImm(traceMask.id, 0)) { return false; }
        if (this->isImm(mask.id,     ~0)) { mask = traceMask; }
        if (this->isImm(traceMask.id,~0)) { traceMask = mask; }
        return true;
    }

    void Builder::trace_line(int traceHookID, I32 mask, I32 traceMask, int line) {
        SkASSERT(traceHookID >= 0);
        SkASSERT(traceHookID < (int)fTraceHooks.size());
        if (!this->mergeMasks(mask, traceMask)) { return; }
        (void)push(Op::trace_line, mask.id,traceMask.id,NA,NA, traceHookID, line);
    }
    void Builder::trace_var(int traceHookID, I32 mask, I32 traceMask, int slot, I32 val) {
        SkASSERT(traceHookID >= 0);
        SkASSERT(traceHookID < (int)fTraceHooks.size());
        if (!this->mergeMasks(mask, traceMask)) { return; }
        (void)push(Op::trace_var, mask.id,traceMask.id,val.id,NA, traceHookID, slot);
    }
    void Builder::trace_enter(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
        SkASSERT(traceHookID >= 0);
        SkASSERT(traceHookID < (int)fTraceHooks.size());
        if (!this->mergeMasks(mask, traceMask)) { return; }
        (void)push(Op::trace_enter, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
    }
    void Builder::trace_exit(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
        SkASSERT(traceHookID >= 0);
        SkASSERT(traceHookID < (int)fTraceHooks.size());
        if (!this->mergeMasks(mask, traceMask)) { return; }
        (void)push(Op::trace_exit, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
    }
    void Builder::trace_scope(int traceHookID, I32 mask, I32 traceMask, int delta) {
        SkASSERT(traceHookID >= 0);
        SkASSERT(traceHookID < (int)fTraceHooks.size());
        if (!this->mergeMasks(mask, traceMask)) { return; }
        (void)push(Op::trace_scope, mask.id,traceMask.id,NA,NA, traceHookID, delta);
    }

    // Stores of 8/16/32 bits; 64- and 128-bit stores take multiple 32-bit lanes.
    void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
    void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
        (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
    }
    void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
        (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
    }

    I32 Builder::index() { return {this, push(Op::index)}; }

    // Loads of 8/16/32 bits; 64- and 128-bit loads select one 32-bit lane.
    I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load64(Ptr ptr, int lane) {
        return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
    }
    I32 Builder::load128(Ptr ptr, int lane) {
        return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
    }

    // Indexed loads from uniform memory at ptr+offset, indexed per lane.
    I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
    }

    I32 Builder::uniform32(UPtr ptr, int offset) {
        return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
    }

    // Note: this converts the array index into a byte offset for the op.
    I32 Builder::array32  (UPtr ptr, int offset, int index) {
        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
    }

    I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }

    // Be careful peepholing float math!  Transformations you might expect to
    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
    // Float peepholes must pass this equivalence test for all ~4B floats:
    //
    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
    //
    //     unsigned bits = 0;
    //     do {
    //        float f;
    //        memcpy(&f, &bits, 4);
    //        if (!equiv(f, ...)) {
    //           abort();
    //        }
    //     } while (++bits != 0);

    F32 Builder::add(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        // Fuse mul+add into a single fma when the hardware supports it.
        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        // Canonicalize commutative operand order to improve CSE hit rate.
        return {this, this->push(Op::add_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
    }

    F32 Builder::sub(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        // a*b - y --> fms(a,b,y);  x - a*b --> fnma(a,b,x).
        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::sub_f32, x.id, y.id)};
    }

    F32 Builder::mul(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
        if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
        return {this, this->push(Op::mul_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
    }

    // Like mul(), but also folds 0*x == 0 — only valid when the caller knows
    // x can't be NaN/Inf (0*NaN is NaN, not 0, so plain mul() can't do this).
    F32 Builder::fast_mul(F32 x, F32 y) {
        if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
        return mul(x,y);
    }

    F32 Builder::div(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
        if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
        return {this, this->push(Op::div_f32, x.id, y.id)};
    }

    F32 Builder::sqrt(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
        return {this, this->push(Op::sqrt_f32, x.id)};
    }

    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
                                  0x3f000000));
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }

    // Inverse of approx_log2(): rebuild a float from an approximate exponent.
    F32 Builder::approx_pow2(F32 x) {
        constexpr float kInfinityBits = 0x7f800000;

        F32 f = fract(x);
        F32 approx = add(x,         121.274057500f);
            approx = sub(approx, mul( 1.490129070f, f));
            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));
            approx = mul(1.0f * (1<<23), approx);
            approx = clamp(approx, 0, kInfinityBits);  // guard against underflow/overflow

        return pun_to_F32(round(approx));
    }

    // x^y via pow2(log2(x) * y), with fast paths for common constant bases/exponents.
    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
864 x = max(0.0f, x); 865 866 if (this->isImm(x.id, 1.0f)) { return x; } // 1^y is one 867 if (this->isImm(x.id, 2.0f)) { return this->approx_pow2(y); } // 2^y is pow2(y) 868 if (this->isImm(y.id, 0.5f)) { return this->sqrt(x); } // x^0.5 is sqrt(x) 869 if (this->isImm(y.id, 1.0f)) { return x; } // x^1 is x 870 if (this->isImm(y.id, 2.0f)) { return x * x; } // x^2 is x*x 871 872 auto is_x = bit_or(eq(x, 0.0f), 873 eq(x, 1.0f)); 874 return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); 875 } 876 877 // Bhaskara I's sine approximation 878 // 16x(pi - x) / (5*pi^2 - 4x(pi - x) 879 // ... divide by 4 880 // 4x(pi - x) / 5*pi^2/4 - x(pi - x) 881 // 882 // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get 883 // radians into that range first. 884 // approx_sin(F32 radians)885 F32 Builder::approx_sin(F32 radians) { 886 constexpr float Pi = SK_ScalarPI; 887 // x = radians mod 2pi 888 F32 x = fract(radians * (0.5f/Pi)) * (2*Pi); 889 I32 neg = x > Pi; // are we pi < x < 2pi --> need to negate result 890 x = select(neg, x - Pi, x); 891 892 F32 pair = x * (Pi - x); 893 x = 4.0f * pair / ((5*Pi*Pi/4) - pair); 894 x = select(neg, -x, x); 895 return x; 896 } 897 898 /* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION" 899 https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf 900 901 approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9 902 903 Some simplifications: 904 1. tan(x) is periodic, -PI/2 < x < PI/2 905 2. tan(x) is odd, so tan(-x) = -tan(x) 906 3. Our polynomial approximation is best near zero, so we use the following identity 907 tan(x) + tan(y) 908 tan(x + y) = ----------------- 909 1 - tan(x)*tan(y) 910 tan(PI/4) = 1 911 912 So for x > PI/8, we do the following refactor: 913 x' = x - PI/4 914 915 1 + tan(x') 916 tan(x) = ------------ 917 1 - tan(x') 918 */ approx_tan(F32 x)919 F32 Builder::approx_tan(F32 x) { 920 constexpr float Pi = SK_ScalarPI; 921 // periodic between -pi/2 ... 
pi/2 922 // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back 923 x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2); 924 925 I32 neg = (x < 0.0f); 926 x = select(neg, -x, x); 927 928 // minimize total error by shifting if x > pi/8 929 I32 use_quotient = (x > (Pi/8)); 930 x = select(use_quotient, x - (Pi/4), x); 931 932 // 9th order poly = 4th order(x^2) * x 933 x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x; 934 x = select(use_quotient, (1+x)/(1-x), x); 935 x = select(neg, -x, x); 936 return x; 937 } 938 939 // http://mathforum.org/library/drmath/view/54137.html 940 // referencing Handbook of Mathematical Functions, 941 // by Milton Abramowitz and Irene Stegun approx_asin(F32 x)942 F32 Builder::approx_asin(F32 x) { 943 I32 neg = (x < 0.0f); 944 x = select(neg, -x, x); 945 x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f); 946 x = select(neg, -x, x); 947 return x; 948 } 949 950 /* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/ 951 * with 129 values of x,atan(x) for x:[0...1] 952 * This only works for 0 <= x <= 1 953 */ approx_atan_unit(F32 x)954 static F32 approx_atan_unit(F32 x) { 955 // for now we might be given NaN... 
let that through 956 x->assert_true((x != x) | ((x >= 0) & (x <= 1))); 957 return poly(x, 0.14130025741326729f, 958 -0.34312835980675116f, 959 -0.016172900528248768f, 960 1.0037696976200385f, 961 -0.00014758242182738969f); 962 } 963 964 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 965 */ approx_atan(F32 x)966 F32 Builder::approx_atan(F32 x) { 967 I32 neg = (x < 0.0f); 968 x = select(neg, -x, x); 969 I32 flip = (x > 1.0f); 970 x = select(flip, 1/x, x); 971 x = approx_atan_unit(x); 972 x = select(flip, SK_ScalarPI/2 - x, x); 973 x = select(neg, -x, x); 974 return x; 975 } 976 977 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 978 * By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit() 979 * which avoids a 2nd divide instruction if we had instead called atan(). 980 */ approx_atan2(F32 y0,F32 x0)981 F32 Builder::approx_atan2(F32 y0, F32 x0) { 982 983 I32 flip = (abs(y0) > abs(x0)); 984 F32 y = select(flip, x0, y0); 985 F32 x = select(flip, y0, x0); 986 F32 arg = y/x; 987 988 I32 neg = (arg < 0.0f); 989 arg = select(neg, -arg, arg); 990 991 F32 r = approx_atan_unit(arg); 992 r = select(flip, SK_ScalarPI/2 - r, r); 993 r = select(neg, -r, r); 994 995 // handle quadrant distinctions 996 r = select((y0 >= 0) & (x0 < 0), r + SK_ScalarPI, r); 997 r = select((y0 < 0) & (x0 <= 0), r - SK_ScalarPI, r); 998 // Note: we don't try to handle 0,0 or infinities (yet) 999 return r; 1000 } 1001 min(F32 x,F32 y)1002 F32 Builder::min(F32 x, F32 y) { 1003 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); } 1004 return {this, this->push(Op::min_f32, x.id, y.id)}; 1005 } max(F32 x,F32 y)1006 F32 Builder::max(F32 x, F32 y) { 1007 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); } 1008 return {this, this->push(Op::max_f32, x.id, y.id)}; 1009 } 1010 1011 SK_ATTRIBUTE(no_sanitize("signed-integer-overflow")) add(I32 x,I32 y)1012 I32 Builder::add(I32 x, I32 y) { 1013 if (int X,Y; this->allImm(x.id,&X, 
                                y.id,&Y)) { return splat(X+Y); }
    if (this->isImm(x.id, 0)) { return y; }
    if (this->isImm(y.id, 0)) { return x; }
    // Commutative: canonicalize argument order by id to improve CSE.
    return {this, this->push(Op::add_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
}
SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
I32 Builder::sub(I32 x, I32 y) {
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
    if (this->isImm(y.id, 0)) { return x; }
    return {this, this->push(Op::sub_i32, x.id, y.id)};
}
SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
I32 Builder::mul(I32 x, I32 y) {
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
    if (this->isImm(x.id, 0)) { return splat(0); }
    if (this->isImm(y.id, 0)) { return splat(0); }
    if (this->isImm(x.id, 1)) { return y; }
    if (this->isImm(y.id, 1)) { return x; }
    return {this, this->push(Op::mul_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
}

// Shifts take an immediate count; shifting by 0 is a no-op and emits nothing.
SK_ATTRIBUTE(no_sanitize("shift"))
I32 Builder::shl(I32 x, int bits) {
    if (bits == 0) { return x; }
    if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
    return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
}
I32 Builder::shr(I32 x, int bits) {
    if (bits == 0) { return x; }
    // Logical (unsigned) shift right when constant-folding.
    if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
    return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
}
I32 Builder::sra(I32 x, int bits) {
    if (bits == 0) { return x; }
    // Arithmetic (sign-extending) shift right when constant-folding.
    if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
    return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
}

// Comparisons produce a full lane mask: ~0 for true, 0 for false.
I32 Builder:: eq(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
    return {this, this->push(Op::eq_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
}
I32 Builder::neq(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
    return {this, this->push(Op::neq_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
}
// lt/lte are expressed via gt/gte with swapped arguments, so only two float
// comparison ops are needed in the instruction set.
I32 Builder::lt(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
    return {this, this->push(Op::gt_f32, y.id, x.id)};
}
I32 Builder::lte(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
    return {this, this->push(Op::gte_f32, y.id, x.id)};
}
I32 Builder::gt(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
    return {this, this->push(Op::gt_f32, x.id, y.id)};
}
I32 Builder::gte(F32 x, F32 y) {
    if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
    return {this, this->push(Op::gte_f32, x.id, y.id)};
}

I32 Builder:: eq(I32 x, I32 y) {
    if (x.id == y.id) { return splat(~0); }   // x == x is always true
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
    return {this, this->push(Op:: eq_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
}
I32 Builder::neq(I32 x, I32 y) {
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
    return ~(x == y);   // no neq_i32 op; invert the eq mask instead
}
I32 Builder:: gt(I32 x, I32 y) {
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
    return {this, this->push(Op:: gt_i32, x.id, y.id)};
}
I32 Builder::gte(I32 x, I32 y) {
    if (x.id == y.id) { return splat(~0); }   // x >= x is always true
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
    return ~(x < y);   // x >= y == !(x < y)
}
I32 Builder:: lt(I32 x, I32 y) { return y>x; }
I32 Builder::lte(I32 x, I32 y) { return y>=x; }

I32 Builder::bit_and(I32 x, I32 y) {
    if (x.id == y.id) { return x; }
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
    if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
    if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
    if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
    if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
    return {this, this->push(Op::bit_and, std::min(x.id, y.id), std::max(x.id, y.id))};
}
I32 Builder::bit_or(I32 x, I32 y) {
    if (x.id == y.id) { return x; }
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
    if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
    if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
    if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
    if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
    return {this, this->push(Op::bit_or, std::min(x.id, y.id), std::max(x.id, y.id))};
}
I32 Builder::bit_xor(I32 x, I32 y) {
    if (x.id == y.id) { return splat(0); }
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
    if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
    if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
    return {this, this->push(Op::bit_xor, std::min(x.id, y.id), std::max(x.id, y.id))};
}

// bit_clear(x,y) == x & ~y.
I32 Builder::bit_clear(I32 x, I32 y) {
    if (x.id == y.id) { return splat(0); }
    if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
    if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
    if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
    if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
    return {this, this->push(Op::bit_clear, x.id, y.id)};
}

// Lane-wise select: x is a mask (~0 or 0 per lane) choosing y or z.
I32 Builder::select(I32 x, I32 y, I32 z) {
    if (y.id == z.id) { return y; }
    if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
    if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
    if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
    if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
    if (this->isImm(z.id, 0)) { return bit_and (y,x); }   //     x ? y : 0 ==  x&y
    return {this, this->push(Op::select, x.id, y.id, z.id)};
}

// Shift x right by bits, then mask with z; the mask is skipped when z already
// covers every remaining bit.
I32 Builder::extract(I32 x, int bits, I32 z) {
    if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
    return bit_and(z, shr(x, bits));
}

// OR y, shifted left by bits, into x.
I32 Builder::pack(I32 x, I32 y, int bits) {
    return bit_or(x, shl(y, bits));
}

F32 Builder::ceil(F32 x) {
    if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
    return {this, this->push(Op::ceil, x.id)};
}
F32 Builder::floor(F32 x) {
    if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
    return {this, this->push(Op::floor, x.id)};
}
F32 Builder::to_F32(I32 x) {
    if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
    return {this, this->push(Op::to_f32, x.id)};
}
// trunc rounds toward zero; round rounds to nearest (lrintf semantics).
I32 Builder::trunc(F32 x) {
    if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
    return {this, this->push(Op::trunc, x.id)};
}
I32 Builder::round(F32 x) {
    if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
    return {this, this->push(Op::round, x.id)};
}

I32 Builder::to_fp16(F32 x) {
    if (float X;
               this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
    return {this, this->push(Op::to_fp16, x.id)};
}
F32 Builder::from_fp16(I32 x) {
    if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
    return {this, this->push(Op::from_fp16, x.id)};
}

// Map a bits-wide unsigned normalized integer to [0,1] float, and back.
F32 Builder::from_unorm(int bits, I32 x) {
    F32 limit = splat(1 / ((1<<bits)-1.0f));
    return mul(to_F32(x), limit);
}
I32 Builder::to_unorm(int bits, F32 x) {
    F32 limit = splat((1<<bits)-1.0f);
    return round(mul(x, limit));
}

// Describe an SkColorType as per-channel bit counts and shifts plus an
// encoding, so load()/store()/gather() can work generically from the table.
PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
    auto UNORM = PixelFormat::UNORM,
         SRGB  = PixelFormat::SRGB,
         FLOAT = PixelFormat::FLOAT;
    switch (ct) {
        case kUnknown_SkColorType: break;

        case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};

        case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
        case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
        case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};

        case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
        case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};

        case kAlpha_8_SkColorType:  return {UNORM, 0,0,0,8, 0,0,0,0};
        case kGray_8_SkColorType:   return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
        case kR8_unorm_SkColorType: return {UNORM, 8,0,0,0, 0,0,0,0};

        case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
        case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)

        case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
        case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
        case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
        case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};

        case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
        case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
        case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
        case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};

        case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
        case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
        case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
    }
    SkASSERT(false);
    return {UNORM, 0,0,0,0, 0,0,0,0};
}

// Size in bytes of one pixel of format f, derived from its bit layout.
static int byte_size(PixelFormat f) {
    // What's the highest bit we read?
    int bits = std::max(f.r_bits + f.r_shift,
               std::max(f.g_bits + f.g_shift,
               std::max(f.b_bits + f.b_shift,
                        f.a_bits + f.a_shift)));
    // Round up to bytes.
    return (bits + 7) / 8;
}

// Decode one <=32-bit pixel x into unpremul float channels, filling absent
// color channels with 0 and absent alpha with 1.
static Color unpack(PixelFormat f, I32 x) {
    SkASSERT(byte_size(f) <= 4);

    auto from_srgb = [](int bits, I32 channel) -> F32 {
        const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
        F32 v = from_unorm(bits, channel);
        return sk_program_transfer_fn(v, sRGBish_TF,
                                      v->splat(tf->g),
                                      v->splat(tf->a),
                                      v->splat(tf->b),
                                      v->splat(tf->c),
                                      v->splat(tf->d),
                                      v->splat(tf->e),
                                      v->splat(tf->f));
    };

    auto unpack_rgb = [=](int bits, int shift) -> F32 {
        I32 channel = extract(x, shift, (1<<bits)-1);
        switch (f.encoding) {
            case PixelFormat::UNORM: return from_unorm(bits, channel);
            case PixelFormat:: SRGB: return from_srgb (bits, channel);
            case PixelFormat::FLOAT: return from_fp16 (      channel);
        }
        SkUNREACHABLE;
    };
    // Alpha is always linear, even in SRGB-encoded formats.
    auto unpack_alpha = [=](int bits, int shift) -> F32 {
        I32 channel = extract(x, shift, (1<<bits)-1);
        switch (f.encoding) {
            case PixelFormat::UNORM:
            case PixelFormat:: SRGB: return from_unorm(bits, channel);
            case PixelFormat::FLOAT: return from_fp16 (      channel);
        }
        SkUNREACHABLE;
    };
    return {
        f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
        f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
        f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
        f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
    };
}

// Split an 8-byte format into two 4-byte formats, one per 32-bit half.
// A channel absent from a half keeps bits=0 / shift=32 so byte_size() still
// reports 4 for that half.
static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
    SkASSERT(byte_size(f) == 8);
    // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
    // The assert on byte_size(lo) will trigger if this assumption is violated.
    *lo = f;
    if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
    if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
    if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
    if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
    SkASSERT(byte_size(*lo) == 4);

    *hi = f;
    if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
    if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
    if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
    if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
    SkASSERT(byte_size(*hi) == 4);
}

// The only 16-byte format we support today is RGBA F32,
// though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
static void assert_16byte_is_rgba_f32(PixelFormat f) {
#if defined(SK_DEBUG)
    SkASSERT(byte_size(f) == 16);
    PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);

    SkASSERT(f.encoding == rgba_f32.encoding);

    SkASSERT(f.r_bits == rgba_f32.r_bits);
    SkASSERT(f.g_bits == rgba_f32.g_bits);
    SkASSERT(f.b_bits == rgba_f32.b_bits);
    SkASSERT(f.a_bits == rgba_f32.a_bits);

    SkASSERT(f.r_shift == rgba_f32.r_shift);
    SkASSERT(f.g_shift == rgba_f32.g_shift);
    SkASSERT(f.b_shift == rgba_f32.b_shift);
    SkASSERT(f.a_shift == rgba_f32.a_shift);
#endif
}

// Load one pixel of format f and decode it to float channels, dispatching on
// the pixel's byte size.
Color Builder::load(PixelFormat f, Ptr ptr) {
    switch (byte_size(f)) {
        case 1: return unpack(f, load8 (ptr));
        case 2: return unpack(f, load16(ptr));
        case 4: return unpack(f, load32(ptr));
        case 8: {
            // Decode each 32-bit half separately, then take each channel from
            // whichever half actually holds it.
            PixelFormat lo,hi;
            split_disjoint_8byte_format(f, &lo,&hi);
            Color l = unpack(lo, load64(ptr, 0)),
                  h = unpack(hi, load64(ptr, 1));
            return {
                lo.r_bits ? l.r : h.r,
                lo.g_bits ? l.g : h.g,
                lo.b_bits ? l.b : h.b,
                lo.a_bits ? l.a : h.a,
            };
        }
        case 16: {
            assert_16byte_is_rgba_f32(f);
            return {
                pun_to_F32(load128(ptr, 0)),
                pun_to_F32(load128(ptr, 1)),
                pun_to_F32(load128(ptr, 2)),
                pun_to_F32(load128(ptr, 3)),
            };
        }
        default: SkUNREACHABLE;
    }
    return {};
}

// Gather one pixel of format f at a per-lane index and decode it, dispatching
// on the pixel's byte size.  Wider-than-4-byte pixels are fetched as multiple
// 32-bit gathers at scaled indices.
Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
    switch (byte_size(f)) {
        case 1: return unpack(f, gather8 (ptr, offset, index));
        case 2: return unpack(f, gather16(ptr, offset, index));
        case 4: return unpack(f, gather32(ptr, offset, index));
        case 8: {
            PixelFormat lo,hi;
            split_disjoint_8byte_format(f, &lo,&hi);
            Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                  h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
            return {
                lo.r_bits ? l.r : h.r,
                lo.g_bits ? l.g : h.g,
                lo.b_bits ? l.b : h.b,
                lo.a_bits ? l.a : h.a,
            };
        }
        case 16: {
            assert_16byte_is_rgba_f32(f);
            return {
                gatherF(ptr, offset, (index<<2)+0),
                gatherF(ptr, offset, (index<<2)+1),
                gatherF(ptr, offset, (index<<2)+2),
                gatherF(ptr, offset, (index<<2)+3),
            };
        }
        default: SkUNREACHABLE;
    }
    return {};
}

// Encode float channels into one <=32-bit pixel of format f.
static I32 pack32(PixelFormat f, Color c) {
    SkASSERT(byte_size(f) <= 4);

    auto to_srgb = [](int bits, F32 v) {
        const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
        return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
                                                     v->splat(tf->g),
                                                     v->splat(tf->a),
                                                     v->splat(tf->b),
                                                     v->splat(tf->c),
                                                     v->splat(tf->d),
                                                     v->splat(tf->e),
                                                     v->splat(tf->f)));
    };

    I32 packed = c->splat(0);
    auto pack_rgb = [&](F32 channel, int bits, int shift) {
        I32 encoded;
        switch (f.encoding) {
            case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
            case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
            case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
        }
        packed = pack(packed, encoded, shift);
    };
    // Alpha stays linear even for SRGB-encoded formats.
    auto pack_alpha = [&](F32 channel, int bits, int shift) {
        I32 encoded;
        switch (f.encoding) {
            case PixelFormat::UNORM:
            case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
            case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
        }
        packed = pack(packed, encoded, shift);
    };
    if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
    if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
    if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
    if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
    return packed;
}

// Encode and store one pixel of format f.
void Builder::store(PixelFormat f, Ptr ptr, Color c) {
    // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
    if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
        f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

        // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
        c.r = c.r * 0.2126f
            + c.g * 0.7152f
            + c.b * 0.0722f;
        f.g_bits = f.b_bits = 0;   // pack only the single gray channel below
    }

    switch (byte_size(f)) {
        case 1: store8 (ptr, pack32(f,c)); break;
        case 2: store16(ptr, pack32(f,c)); break;
        case 4: store32(ptr, pack32(f,c)); break;
        case 8: {
            PixelFormat lo,hi;
            split_disjoint_8byte_format(f, &lo,&hi);
            store64(ptr, pack32(lo,c)
                       , pack32(hi,c));
            break;
        }
        case 16: {
            assert_16byte_is_rgba_f32(f);
            store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
            break;
        }
        default: SkUNREACHABLE;
    }
}

// Divide r,g,b by alpha, treating a==0 (where 1/a is +inf) as a no-op scale.
void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
    skvm::F32 invA = 1.0f / a,
              inf  = pun_to_F32(splat(0x7f800000));
    // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
    invA = select(invA < inf, invA
                            , 0.0f);
    *r *= invA;
    *g *= invA;
    *b *= invA;
}

// Multiply r,g,b by alpha.
void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
    *r *= a;
    *g *= a;
    *b *= a;
}

// Push the four channels of color into the uniform buffer and return values
// that read them back at run time.
Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
    auto [r,g,b,a] = color;
    return {
        uniformF(uniforms->pushF(r)),
        uniformF(uniforms->pushF(g)),
        uniformF(uniforms->pushF(b)),
        uniformF(uniforms->pushF(a)),
    };
}

// lo + (hi-lo)*t, with t==0 / t==1 peepholes.
F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
    if (this->isImm(t.id, 0.0f)) { return lo; }
    if (this->isImm(t.id, 1.0f)) { return hi; }
    return mad(sub(hi, lo), t, lo);
}

Color Builder::lerp(Color lo, Color hi, F32 t) {
    return {
        lerp(lo.r, hi.r, t),
        lerp(lo.g, hi.g, t),
        lerp(lo.b, hi.b, t),
        lerp(lo.a, hi.a, t),
    };
}

// Convert RGBA to hue/saturation/lightness/alpha.
HSLA Builder::to_hsla(Color c) {
    F32 mx = max(max(c.r,c.g),c.b),
        mn = min(min(c.r,c.g),c.b),
         d = mx - mn,
      invd = 1.0f / d,
    g_lt_b = select(c.g < c.b, splat(6.0f)
                             , splat(0.0f));

    F32 h = (1/6.0f) * select(mx == mn, 0.0f,
                       select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                       select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                       , invd * (c.r - c.g) + 4.0f)));

    F32 sum = mx + mn,
          l = sum * 0.5f,
          s = select(mx == mn, 0.0f
                             , d / select(l > 0.5f, 2.0f - sum
                                                  , sum));
    return {h, s, l, c.a};
}

// Convert hue/saturation/lightness/alpha back to RGBA.
Color Builder::to_rgba(HSLA c) {
    // See GrRGBToHSLFilterEffect.fp

    auto [h,s,l,a] = c;
    F32 x = s * (1.0f - abs(l + l - 1.0f));

    auto hue_to_rgb = [&,l=l](auto hue) {
        auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
        return x * (clamp01(q) - 0.5f) + l;
    };

    return {
        hue_to_rgb(h + 0/3.0f),
        hue_to_rgb(h + 2/3.0f),
        hue_to_rgb(h + 1/3.0f),
        c.a,
    };
}

// We're basing our implementation of non-separable blend modes on
//   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
// and
//   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but ES' math has been better simplified.
//
// Anything extra we add beyond that is to make the math work with premul inputs.

static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
    return max(r, max(g, b))
         - min(r, min(g, b));
}

static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
    return r*0.30f + g*0.59f + b*0.11f;
}

static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
    F32 mn  = min(*r, min(*g, *b)),
        mx  = max(*r, max(*g, *b)),
        sat = mx - mn;

    // Map min channel to 0, max channel to s, and scale the middle proportionally.
    // Guard the division: when sat is 0 (all channels equal) scaled is not
    // finite, and every channel maps to 0.
    auto scale = [&](skvm::F32 c) {
        auto scaled = ((c - mn) * s) / sat;
        return select(is_finite(scaled), scaled, 0.0f);
    };
    *r = scale(*r);
    *g = scale(*g);
    *b = scale(*b);
}

// Shift r,g,b uniformly so their luminance() equals lu.
static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
    auto diff = lu - luminance(*r, *g, *b);
    *r += diff;
    *g += diff;
    *b += diff;
}

// Pull any out-of-range channel back toward the luminance so all land in [0,a].
static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
    F32 mn = min(*r, min(*g, *b)),
        mx = max(*r, max(*g, *b)),
        lu = luminance(*r, *g, *b);

    auto clip = [&](auto c) {
        c = select(mn >= 0, c
                          , lu + ((c-lu)*(   lu)) / (lu-mn));
        c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
                          , c);
        return clamp01(c);  // May be a little negative, or worse, NaN.
    };
    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}

// Emit instructions computing src blended onto dst (both premul) with the
// given SkBlendMode.
Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
    // mma(x,y,z,w) == x*y + z*w.
    auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
        return x*y + z*w;
    };

    auto two = [](skvm::F32 x) { return x+x; };

    // Apply fn to all four channels independently.
    auto apply_rgba = [&](auto fn) {
        return Color {
            fn(src.r, dst.r),
            fn(src.g, dst.g),
            fn(src.b, dst.b),
            fn(src.a, dst.a),
        };
    };

    // Apply fn to r,g,b; alpha always blends src-over.
    auto apply_rgb_srcover_a = [&](auto fn) {
        return Color {
            fn(src.r, dst.r),
            fn(src.g, dst.g),
            fn(src.b, dst.b),
            mad(dst.a, 1-src.a, src.a),   // srcover for alpha
        };
    };

    // Finish a non-separable blend: add the premul (1-alpha) terms and
    // srcover the alphas.
    auto non_sep = [&](auto R, auto G, auto B) {
        return Color{
            R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
            G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
            B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
            mad(dst.a, 1-src.a, src.a),   // srcover for alpha
        };
    };

    switch (mode) {
        default:
            SkASSERT(false);
            [[fallthrough]]; /*but also, for safety, fallthrough*/

        case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };

        case SkBlendMode::kSrc: return src;
        case SkBlendMode::kDst: return dst;

        // Dst* modes are their Src* counterparts with the roles swapped.
        case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
        case SkBlendMode::kSrcOver:
            return apply_rgba([&](auto s, auto d) {
                return mad(d,1-src.a, s);
            });

        case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
        case SkBlendMode::kSrcIn:
            return apply_rgba([&](auto s, auto d) {
                return s * dst.a;
            });

        case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];

        case SkBlendMode::kSrcOut:
            return apply_rgba([&](auto s, auto d) {
                return s * (1-dst.a);
            });

        case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
        case SkBlendMode::kSrcATop:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, dst.a,  d, 1-src.a);
            });

        case SkBlendMode::kXor:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, 1-dst.a,  d, 1-src.a);
            });

        case SkBlendMode::kPlus:
            return apply_rgba([&](auto s, auto d) {
                return min(s+d, 1.0f);
            });

        case SkBlendMode::kModulate:
            return apply_rgba([&](auto s, auto d) {
                return s * d;
            });

        case SkBlendMode::kScreen:
            // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
            // It's kind of plausible that s + (d - sd) keeps more precision?
            return apply_rgba([&](auto s, auto d) {
                return s + (d - s*d);
            });

        case SkBlendMode::kDarken:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - max(s * dst.a,
                                    d * src.a));
            });

        case SkBlendMode::kLighten:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - min(s * dst.a,
                                    d * src.a));
            });

        case SkBlendMode::kDifference:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - two(min(s * dst.a,
                                        d * src.a)));
            });

        case SkBlendMode::kExclusion:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - two(s * d));
            });

        case SkBlendMode::kColorBurn:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                // burn may divide by zero; fall back when it's not finite.
                auto mn   = min(dst.a,
                                src.a * (dst.a - d) / s),
                     burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
                return select(d == dst.a     , s * (1-dst.a) + d,
                       select(is_finite(burn), burn
                                             , d * (1-src.a) + s));
            });

        case SkBlendMode::kColorDodge:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                // dodge may divide by zero; fall back when it's not finite.
                auto dodge = src.a * min(dst.a,
                                         d * src.a / (src.a - s))
                                   + mma(s, 1-dst.a, d, 1-src.a);
                return select(d == 0.0f       , s * (1-dst.a) + d,
                       select(is_finite(dodge), dodge
                                              , d * (1-src.a) + s));
            });

        case SkBlendMode::kHardLight:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) +
                       select(two(s) <= src.a,
                              two(s * d),
                              src.a * dst.a - two((dst.a - d) * (src.a - s)));
            });

        case SkBlendMode::kOverlay:
            // Overlay is HardLight with src/dst swapped in the selector.
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) +
                       select(two(d) <= dst.a,
                              two(s * d),
                              src.a * dst.a - two((dst.a - d) * (src.a - s)));
            });

        case SkBlendMode::kMultiply:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) + s * d;
            });

        case SkBlendMode::kSoftLight:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                auto  m = select(dst.a > 0.0f, d / dst.a
                                             , 0.0f),
                     s2 = two(s),
                     m4 = 4*m;

                // The logic forks three ways:
                //    1. dark src?
                //    2. light src, dark dst?
                //    3. light src, light dst?

                     // Used in case 1
                auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                     // Used in case 2
                     darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                     // Used in case 3.
                     liteDst = sqrt(m) - m,
                     // Used in 2 or 3?
                     liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                         , liteDst)
                               + d * src.a;
                return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
                                                                         , liteSrc);
            });

        // Non-separable modes work on src/dst channels pre-scaled by the
        // other's alpha, per the W3C compositing spec adapted for premul.
        case SkBlendMode::kHue: {
            skvm::F32 R = src.r * src.a,
                      G = src.g * src.a,
                      B = src.b * src.a;

            set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
            set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kSaturation: {
            skvm::F32 R = dst.r * src.a,
                      G = dst.g * src.a,
                      B = dst.b * src.a;

            set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
            set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kColor: {
            skvm::F32 R = src.r * dst.a,
                      G = src.g * dst.a,
                      B = src.b * dst.a;

            set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kLuminosity: {
            skvm::F32 R = dst.r * src.a,
                      G = dst.g * src.a,
                      B = dst.b * src.a;

            set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
            clip_color(&R, &G, &B, dst.a * src.a);

            return non_sep(R, G, B);
        }
    }
}
// ~~~~ Program::eval() and co. ~~~~ //

// Handy references for x86-64 instruction encoding:
// https://wiki.osdev.org/X86-64_Instruction_Encoding
// https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
// https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
// http://ref.x86asm.net/coder64.html

// Used for ModRM / immediate instruction encoding.
// Packs three fields into one byte as 2-3-3 bits: aa bbb ccc.
static uint8_t _233(int a, int b, int c) {
    return (a & 3) << 6
         | (b & 7) << 3
         | (c & 7) << 0;
}

// ModRM byte encodes the arguments of an opcode.
enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
static uint8_t mod_rm(Mod mod, int reg, int rm) {
    return _233((int)mod, reg, rm);
}

// Choose the smallest addressing mode that can hold displacement `imm`.
static Mod mod(int imm) {
    if (imm == 0)               { return Mod::Indirect; }
    if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
    return Mod::FourByteImm;
}

// How many displacement bytes follow the ModRM/SIB bytes for this mode.
static int imm_bytes(Mod mod) {
    switch (mod) {
        case Mod::Indirect:    return 0;
        case Mod::OneByteImm:  return 1;
        case Mod::FourByteImm: return 4;
        case Mod::Direct: SkUNREACHABLE;   // Direct mode has no displacement.
    }
    SkUNREACHABLE;
}

// SIB byte encodes a memory address, base + (index * scale).
static uint8_t sib(Assembler::Scale scale, int index, int base) {
    return _233((int)scale, index, base);
}

// The REX prefix is used to extend most old 32-bit instructions to 64-bit.
static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
                   bool R,   // Extra top bit to select ModRM reg, registers 8-15.
                   bool X,   // Extra top bit for SIB index register.
                   bool B) { // Extra top bit for SIB base or ModRM rm register.
    return 0b01000000   // Fixed 0100 for top four bits.
         | (W << 3)
         | (R << 2)
         | (X << 1)
         | (B << 0);
}


// The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
struct VEX {
    int     len;       // 2 or 3 bytes actually used in `bytes`.
    uint8_t bytes[3];
};

static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
               bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
               bool   X,   // Same as REX X.
               bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
               int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
               int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
               bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
               int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.

    // Pack x86 opcode map selector to 5-bit VEX encoding.
    map = [map]{
        switch (map) {
            case   0x0f: return 0b00001;
            case 0x380f: return 0b00010;
            case 0x3a0f: return 0b00011;
            // Several more cases only used by XOP / TBM.
        }
        SkUNREACHABLE;
    }();

    // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding.
    pp = [pp]{
        switch (pp) {
            case 0x66: return 0b01;
            case 0xf3: return 0b10;
            case 0xf2: return 0b11;
        }
        return 0b00;
    }();

    VEX vex = {0, {0,0,0}};
    if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
        // With these conditions met, we can optionally compress VEX to 2-byte.
        vex.len = 2;
        vex.bytes[0] = 0xc5;
        vex.bytes[1] = (pp      &  3) << 0
                     | (L       &  1) << 2
                     | (~vvvv   & 15) << 3    // vvvv and R are stored inverted.
                     | (~(int)R &  1) << 7;
    } else {
        // We could use this 3-byte VEX prefix all the time if we like.
        vex.len = 3;
        vex.bytes[0] = 0xc4;
        vex.bytes[1] = (map     & 31) << 0
                     | (~(int)B &  1) << 5    // B, X, R, vvvv stored inverted.
                     | (~(int)X &  1) << 6
                     | (~(int)R &  1) << 7;
        vex.bytes[2] = (pp      &  3) << 0
                     | (L       &  1) << 2
                     | (~vvvv   & 15) << 3
                     | (WE      &  1) << 7;
    }
    return vex;
}

// If buf is null the Assembler only counts bytes (sizing pass);
// otherwise it writes code into buf.
Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}

size_t Assembler::size() const { return fSize; }

// Append n raw bytes (or just count them when fCode is null).
void Assembler::bytes(const void* p, int n) {
    if (fCode) {
        memcpy(fCode+fSize, p, n);
    }
    fSize += n;
}

void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
void Assembler::word(uint32_t w) { this->bytes(&w, 4); }

// Pad with 0x00 until size() is a multiple of mod.
void Assembler::align(int mod) {
    while (this->size() % mod) {
        this->byte(0x00);
    }
}

// Software breakpoint.
void Assembler::int3() {
    this->byte(0xcc);
}

// Zero the upper halves of all ymm registers (avoids AVX/SSE transition stalls).
void Assembler::vzeroupper() {
    this->byte(0xc5);
    this->byte(0xf8);
    this->byte(0x77);
}
void Assembler::ret() { this->byte(0xc3); }

// Core 64-bit GP-register instruction encoder: REX prefix, 1- or 2-byte
// opcode, ModRM, and optional SIB + displacement for memory operands.
void Assembler::op(int opcode, Operand dst, GP64 x) {
    if (dst.kind == Operand::REG) {
        this->byte(rex(W1,x>>3,0,dst.reg>>3));
        this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
        this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
    } else {
        SkASSERT(dst.kind == Operand::MEM);
        const Mem& m = dst.mem;
        // rsp in the rm slot signals an SIB byte; also needed to use an index.
        const bool need_SIB = (m.base&7) == rsp
                           || m.index != rsp;

        this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
        this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
        this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
        if (need_SIB) {
            this->byte(sib(m.scale, m.index&7, m.base&7));
        }
        this->bytes(&m.disp, imm_bytes(mod(m.disp)));
    }
}

// Encoder for GP instructions taking an immediate; opcode_ext goes in the
// ModRM reg field, and the opcode is adjusted for immediate size.
void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
    opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

    int imm_bytes = 4;
    if (SkTFitsIn<int8_t>(imm)) {
        imm_bytes = 1;
        opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
    }

    this->op(opcode, dst, (GP64)opcode_ext);
    this->bytes(&imm, imm_bytes);
}

// add/sub/cmp with an immediate share base opcode 0x01; the /0, /5, /7
// opcode extensions select the operation.
void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }

// These don't work quite like the other instructions with immediates:
// these immediates are always fixed size at 4 bytes or 1 byte.
mov(Operand dst,int imm)2012 void Assembler::mov(Operand dst, int imm) { 2013 this->op(0xC7,dst,(GP64)0b000); 2014 this->word(imm); 2015 } movb(Operand dst,int imm)2016 void Assembler::movb(Operand dst, int imm) { 2017 this->op(0xC6,dst,(GP64)0b000); 2018 this->byte(imm); 2019 } 2020 add(Operand dst,GP64 x)2021 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); } sub(Operand dst,GP64 x)2022 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); } cmp(Operand dst,GP64 x)2023 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); } mov(Operand dst,GP64 x)2024 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); } movb(Operand dst,GP64 x)2025 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); } 2026 add(GP64 dst,Operand x)2027 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); } sub(GP64 dst,Operand x)2028 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); } cmp(GP64 dst,Operand x)2029 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); } mov(GP64 dst,Operand x)2030 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); } movb(GP64 dst,Operand x)2031 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); } 2032 movzbq(GP64 dst,Operand x)2033 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); } movzwq(GP64 dst,Operand x)2034 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); } 2035 vpaddd(Ymm dst,Ymm x,Operand y)2036 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } vpsubd(Ymm dst,Ymm x,Operand y)2037 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } vpmulld(Ymm dst,Ymm x,Operand y)2038 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); } 2039 vpaddw(Ymm dst,Ymm x,Operand y)2040 void Assembler::vpaddw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfd, dst,x,y); } 
// 16-bit lane integer ops.
void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }

void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }


// Bitwise ops.
void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

// Packed single-precision float arithmetic (no mandatory prefix).
void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }

// FMA3: the 132/213/231 suffixes select which operand order is used.
void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }

void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }

// Saturating narrowing packs and 32-bit interleaves.
void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }

// Integer compares (results are all-1s / all-0s lane masks).
void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }


void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
    // When we've embedded a label displacement in the middle of an instruction,
    // we need to tweak it a little so that the resolved displacement starts
    // from the end of the instruction and not the end of the displacement.
    if (operand.kind == Operand::LABEL && fCode) {
        int disp;
        memcpy(&disp, fCode+fSize-4, 4);
        disp--;    // Account for the immediate byte that follows the displacement.
        memcpy(fCode+fSize-4, &disp, 4);
    }
    this->byte(imm);
}

// Float compare; `imm` selects the comparison predicate.
void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
    this->op(0,0x0f,0xc2, dst,x,y);
    this->imm_byte_after_operand(y, imm);
}

// Byte-wise blend; mask register z is encoded in the top 4 bits of the immediate.
void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
    this->op(0x66,0x3a0f,0x4c, dst,x,y);
    this->imm_byte_after_operand(y, z << 4);
}

// Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
// Immediate dword shifts (opcode 0x72): extensions /6 = sll, /2 = srl, /4 = sra.
void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
    this->byte(imm);
}
void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
    this->byte(imm);
}
void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
    this->byte(imm);
}
// Immediate word shifts (opcode 0x71), same extension scheme.
void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
    this->byte(imm);
}
void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
    this->byte(imm);
}
void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
    this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
    this->byte(imm);
}

void Assembler::vpermq(Ymm dst, Operand x, int imm) {
    // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
    this->op(0x66,0x3a0f,0x00, dst,x,W1);
    this->imm_byte_after_operand(x, imm);
}

void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
    this->op(0x66,0x3a0f,0x06, dst,x,y);
    this->imm_byte_after_operand(y, imm);
}

void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
    this->op(0x66,0x380f,0x16, dst,ix,src);
}

void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
    this->op(0x66,0x3a0f,0x08, dst,x);
    this->imm_byte_after_operand(x, imm);
}

// Loads (0x10/0x6f) and stores (0x11); note stores pass src in the reg slot.
void Assembler::vmovdqa(Ymm dst, Operand src)  { this->op(0x66,0x0f,0x6f, dst,src); }
void Assembler::vmovups(Ymm dst, Operand src)  { this->op(   0,0x0f,0x10, dst,src); }
void Assembler::vmovups(Xmm dst, Operand src)  { this->op(   0,0x0f,0x10, dst,src); }
void Assembler::vmovups(Operand dst, Ymm src)  { this->op(   0,0x0f,0x11, src,dst); }
void Assembler::vmovups(Operand dst, Xmm src)  { this->op(   0,0x0f,0x11, src,dst); }

// int <-> float conversions and sqrt.
void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }

// float <-> half conversions.
void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
    this->op(0x66,0x3a0f,0x1d, x,dst);
    this->imm_byte_after_operand(dst, imm);
}
void Assembler::vcvtph2ps(Ymm dst, Operand x) {
    this->op(0x66,0x380f,0x13, dst,x);
}

// Record a pending ARM 19-bit branch displacement to label l,
// returning the (possibly not-yet-final) instruction-count offset.
int Assembler::disp19(Label* l) {
    SkASSERT(l->kind == Label::NotYetSet ||
             l->kind == Label::ARMDisp19);
    int here = (int)this->size();
    l->kind = Label::ARMDisp19;
    l->references.push_back(here);
    // ARM 19-bit instruction count, from the beginning of this instruction.
    return (l->offset - here) / 4;
}

// Record a pending x86 32-bit displacement to label l.
int Assembler::disp32(Label* l) {
    SkASSERT(l->kind == Label::NotYetSet ||
             l->kind == Label::X86Disp32);
    int here = (int)this->size();
    l->kind = Label::X86Disp32;
    l->references.push_back(here);
    // x86 32-bit byte count, from the end of this instruction.
    return l->offset - (here + 4);
}

// Core VEX instruction encoder: prefix, opcode, ModRM, then SIB/displacement
// for memory operands or an IP-relative displacement for labels.
void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
    switch (y.kind) {
        case Operand::REG: {
            VEX v = vex(w, dst>>3, 0, y.reg>>3,
                        map, x, l, prefix);
            this->bytes(v.bytes, v.len);
            this->byte(opcode);
            this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
        } return;

        case Operand::MEM: {
            // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
            // without an SIB byte, that's where the base register would usually go.
            // This means we have to use an SIB byte if we want to use rsp as a base register.
            const Mem& m = y.mem;
            const bool need_SIB = m.base  == rsp
                               || m.index != rsp;

            VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
                        map, x, l, prefix);
            this->bytes(v.bytes, v.len);
            this->byte(opcode);
            this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
            if (need_SIB) {
                this->byte(sib(m.scale, m.index&7, m.base&7));
            }
            this->bytes(&m.disp, imm_bytes(mod(m.disp)));
        } return;

        case Operand::LABEL: {
            // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
            const int rip = rbp;

            VEX v = vex(w, dst>>3, 0, rip>>3,
                        map, x, l, prefix);
            this->bytes(v.bytes, v.len);
            this->byte(opcode);
            this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
            this->word(this->disp32(y.label));
        } return;
    }
}

void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }

void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }

void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }

void Assembler::jump(uint8_t condition, Label* l) {
    // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
    //    7?     one-byte-disp
    //    0F 8?  four-byte-disp
    // We always use the near displacement to make updating labels simpler (no resizing).
    this->byte(0x0f);
    this->byte(condition);
    this->word(this->disp32(l));
}
void Assembler::je (Label* l) { this->jump(0x84, l); }
void Assembler::jne(Label* l) { this->jump(0x85, l); }
void Assembler::jl (Label* l) { this->jump(0x8c, l); }
void Assembler::jc (Label* l) { this->jump(0x82, l); }

void Assembler::jmp(Label* l) {
    // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
    this->byte(0xe9);
    this->word(this->disp32(l));
}

// Zero-extending widening loads.
void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }

// Narrow stores/moves of the low 8/4 bytes of an xmm register.
void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }

void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }

// Lane inserts: imm picks the destination lane.
void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
    this->op(0x66,0x3a0f,0x22, dst,src,y);
    this->imm_byte_after_operand(y, imm);
}
void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
    this->op(0x66,0x0f,0xc4, dst,src,y);
    this->imm_byte_after_operand(y, imm);
}
void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
    this->op(0x66,0x3a0f,0x20, dst,src,y);
    this->imm_byte_after_operand(y, imm);
}

// Lane extracts; these never take label operands, so a plain imm byte suffices.
void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
    this->op(0x66,0x3a0f,0x39, src,dst);
    SkASSERT(dst.kind != Operand::LABEL);
    this->byte(imm);
}
void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
    this->op(0x66,0x3a0f,0x16, src,dst);
    SkASSERT(dst.kind != Operand::LABEL);
    this->byte(imm);
}
void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
    this->op(0x66,0x3a0f,0x15, src,dst);
    SkASSERT(dst.kind != Operand::LABEL);
    this->byte(imm);
}
void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
    this->op(0x66,0x3a0f,0x14, src,dst);
    SkASSERT(dst.kind != Operand::LABEL);
    this->byte(imm);
}

// Masked gather of 32-bit floats from base + ix*scale; mask lanes are
// consumed (zeroed) by the instruction.
void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
    // Unlike most instructions, no aliasing is permitted here.
    SkASSERT(dst != ix);
    SkASSERT(dst != mask);
    SkASSERT(mask != ix);

    int prefix = 0x66,
        map    = 0x380f,
        opcode = 0x92;
    VEX v = vex(0, dst>>3, ix>>3, base>>3,
                map, mask, /*ymm?*/1, prefix);
    this->bytes(v.bytes, v.len);
    this->byte(opcode);
    this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
    this->byte(sib(scale, ix&7, base&7));
}

// https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf

// N_mask == low N bits set, used to clip fields to their bit widths below.
static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }

// Pack a three-register A64 instruction word: hi | m | lo | n | d.
void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
    this->word( (hi  & 11_mask) << 21
              | (m   &  5_mask) << 16
              | (lo  &  6_mask) << 10
              | (n   &  5_mask) <<  5
              | (d   &  5_mask) <<  0);
}
// Pack a two-register A64 instruction word with an instruction-specific immediate.
void Assembler::op(uint32_t op22, V n, V d, int imm) {
    this->word( (op22 & 22_mask) << 10
              | imm   // size and location depends on the instruction
              | (n    &  5_mask) <<  5
              | (d    &  5_mask) <<  0);
}

// 128-bit vector bitwise ops.
void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

// 32-bit lane integer arithmetic.
void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

// 32-bit lane integer compares (lane masks out).
void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

// 16-bit lane integer arithmetic.
void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

// 32-bit lane float arithmetic.
void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }

// 32-bit lane float compares.
void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

// Fused multiply-add / multiply-subtract, accumulating into d.
void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

// Table lookup (byte shuffle).
void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

// 32-bit lane unzip/zip interleaves.
void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }

// Immediate shifts; left shifts encode imm directly, right shifts encode
// the negated count (see the A64 immh:immb encoding).
void Assembler::sli4s(V d, V n, int imm5) {
    this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
}
void Assembler::shl4s(V d, V n, int imm5) {
    this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
}
void Assembler::sshr4s(V d, V n, int imm5) {
    this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
}
void Assembler::ushr4s(V d, V n, int imm5) {
    this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
}
void Assembler::ushr8h(V d, V n, int imm4) {
    this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
}

// int <-> float conversions and float rounding.
void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }

// float <-> half conversions.
void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }

// Integer narrowing (xtn) and zero-extending widening (uxtl).
void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

// Horizontal unsigned minimum across the four 32-bit lanes.
void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }

// Software breakpoint with a 16-bit comment field.
void Assembler::brk(int imm16) {
    this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
}

void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }

// 64-bit scalar add/sub with a 12-bit unsigned immediate;
// subs also sets condition flags.
void Assembler::add(X d, X n, int imm12) {
    this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
}
void Assembler::sub(X d, X n, int imm12) {
    this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
}
void Assembler::subs(X d, X n, int imm12) {
    this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
}
n,X m,Shift shift,int imm6)2433 void Assembler::add(X d, X n, X m, Shift shift, int imm6) { 2434 SkASSERT(shift != ROR); 2435 2436 int imm = (imm6 & 6_mask) << 0 2437 | (m & 5_mask) << 6 2438 | (0 & 1_mask) << 11 2439 | (shift & 2_mask) << 12; 2440 this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10); 2441 } 2442 b(Condition cond,Label * l)2443 void Assembler::b(Condition cond, Label* l) { 2444 const int imm19 = this->disp19(l); 2445 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5); 2446 } cbz(X t,Label * l)2447 void Assembler::cbz(X t, Label* l) { 2448 const int imm19 = this->disp19(l); 2449 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2450 } cbnz(X t,Label * l)2451 void Assembler::cbnz(X t, Label* l) { 2452 const int imm19 = this->disp19(l); 2453 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2454 } 2455 ldrd(X dst,X src,int imm12)2456 void Assembler::ldrd(X dst, X src, int imm12) { 2457 this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2458 } ldrs(X dst,X src,int imm12)2459 void Assembler::ldrs(X dst, X src, int imm12) { 2460 this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2461 } ldrh(X dst,X src,int imm12)2462 void Assembler::ldrh(X dst, X src, int imm12) { 2463 this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2464 } ldrb(X dst,X src,int imm12)2465 void Assembler::ldrb(X dst, X src, int imm12) { 2466 this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2467 } 2468 ldrq(V dst,X src,int imm12)2469 void Assembler::ldrq(V dst, X src, int imm12) { 2470 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10); 2471 } ldrd(V dst,X src,int imm12)2472 void Assembler::ldrd(V dst, X src, int imm12) { 2473 this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2474 } ldrs(V dst,X src,int imm12)2475 void Assembler::ldrs(V dst, X src, int imm12) { 2476 
this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2477 } ldrh(V dst,X src,int imm12)2478 void Assembler::ldrh(V dst, X src, int imm12) { 2479 this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2480 } ldrb(V dst,X src,int imm12)2481 void Assembler::ldrb(V dst, X src, int imm12) { 2482 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2483 } 2484 strs(X src,X dst,int imm12)2485 void Assembler::strs(X src, X dst, int imm12) { 2486 this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2487 } 2488 strq(V src,X dst,int imm12)2489 void Assembler::strq(V src, X dst, int imm12) { 2490 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10); 2491 } strd(V src,X dst,int imm12)2492 void Assembler::strd(V src, X dst, int imm12) { 2493 this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2494 } strs(V src,X dst,int imm12)2495 void Assembler::strs(V src, X dst, int imm12) { 2496 this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2497 } strh(V src,X dst,int imm12)2498 void Assembler::strh(V src, X dst, int imm12) { 2499 this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2500 } strb(V src,X dst,int imm12)2501 void Assembler::strb(V src, X dst, int imm12) { 2502 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2503 } 2504 movs(X dst,V src,int lane)2505 void Assembler::movs(X dst, V src, int lane) { 2506 int imm5 = (lane << 3) | 0b100; 2507 this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16); 2508 } inss(V dst,X src,int lane)2509 void Assembler::inss(V dst, X src, int lane) { 2510 int imm5 = (lane << 3) | 0b100; 2511 this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16); 2512 } 2513 2514 ldrq(V dst,Label * l)2515 void Assembler::ldrq(V dst, Label* l) { 2516 const int imm19 = this->disp19(l); 2517 
this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5); 2518 } 2519 dup4s(V dst,X src)2520 void Assembler::dup4s(V dst, X src) { 2521 this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst); 2522 } 2523 ld1r4s(V dst,X src)2524 void Assembler::ld1r4s(V dst, X src) { 2525 this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst); 2526 } ld1r8h(V dst,X src)2527 void Assembler::ld1r8h(V dst, X src) { 2528 this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst); 2529 } ld1r16b(V dst,X src)2530 void Assembler::ld1r16b(V dst, X src) { 2531 this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst); 2532 } 2533 ld24s(V dst,X src)2534 void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); } ld44s(V dst,X src)2535 void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); } st24s(V src,X dst)2536 void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); } st44s(V src,X dst)2537 void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); } 2538 ld24s(V dst,X src,int lane)2539 void Assembler::ld24s(V dst, X src, int lane) { 2540 int Q = (lane & 2)>>1, 2541 S = (lane & 1); 2542 /* Q S */ 2543 this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12)); 2544 } ld44s(V dst,X src,int lane)2545 void Assembler::ld44s(V dst, X src, int lane) { 2546 int Q = (lane & 2)>>1, 2547 S = (lane & 1); 2548 this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12)); 2549 } 2550 label(Label * l)2551 void Assembler::label(Label* l) { 2552 if (fCode) { 2553 // The instructions all currently point to l->offset. 2554 // We'll want to add a delta to point them to here. 2555 int here = (int)this->size(); 2556 int delta = here - l->offset; 2557 l->offset = here; 2558 2559 if (l->kind == Label::ARMDisp19) { 2560 for (int ref : l->references) { 2561 // ref points to a 32-bit instruction with 19-bit displacement in instructions. 
2562 uint32_t inst; 2563 memcpy(&inst, fCode + ref, 4); 2564 2565 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] 2566 int disp = (int)(inst << 8) >> 13; 2567 2568 disp += delta/4; // delta is in bytes, we want instructions. 2569 2570 // Put it all back together, preserving the high 8 bits and low 5. 2571 inst = ((disp << 5) & (19_mask << 5)) 2572 | ((inst ) & ~(19_mask << 5)); 2573 memcpy(fCode + ref, &inst, 4); 2574 } 2575 } 2576 2577 if (l->kind == Label::X86Disp32) { 2578 for (int ref : l->references) { 2579 // ref points to a 32-bit displacement in bytes. 2580 int disp; 2581 memcpy(&disp, fCode + ref, 4); 2582 2583 disp += delta; 2584 2585 memcpy(fCode + ref, &disp, 4); 2586 } 2587 } 2588 } 2589 } 2590 eval(int n,void * args[]) const2591 void Program::eval(int n, void* args[]) const { 2592 #define SKVM_JIT_STATS 0 2593 #if SKVM_JIT_STATS 2594 static std::atomic<int64_t> calls{0}, jits{0}, 2595 pixels{0}, fast{0}; 2596 pixels += n; 2597 if (0 == calls++) { 2598 atexit([]{ 2599 int64_t num = jits .load(), 2600 den = calls.load(); 2601 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den); 2602 num = fast .load(); 2603 den = pixels.load(); 2604 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den); 2605 }); 2606 } 2607 #endif 2608 2609 #if !defined(SKVM_JIT_BUT_IGNORE_IT) 2610 const void* jit_entry = fImpl->jit_entry.load(); 2611 // jit_entry may be null either simply because we can't JIT, or when using LLVM 2612 // if the work represented by fImpl->llvm_compiling hasn't finished yet. 2613 // 2614 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it 2615 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off, 2616 // due to timing or program caching. 
2617 if (jit_entry != nullptr && gSkVMAllowJIT) { 2618 #if SKVM_JIT_STATS 2619 jits++; 2620 fast += n; 2621 #endif 2622 void** a = args; 2623 switch (fImpl->strides.size()) { 2624 case 0: return ((void(*)(int ))jit_entry)(n ); 2625 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] ); 2626 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] ); 2627 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]); 2628 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry) 2629 (n,a[0],a[1],a[2],a[3]); 2630 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry) 2631 (n,a[0],a[1],a[2],a[3],a[4]); 2632 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry) 2633 (n,a[0],a[1],a[2],a[3],a[4],a[5]); 2634 case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry) 2635 (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]); 2636 default: break; //SkASSERT(fImpl->strides.size() <= 7); 2637 } 2638 } 2639 #endif 2640 2641 // So we'll sometimes use the interpreter here even if later calls will use the JIT. 2642 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), 2643 this->nregs(), this->loop(), fImpl->strides.data(), 2644 fImpl->traceHooks.data(), fImpl->traceHooks.size(), 2645 this->nargs(), n, args); 2646 } 2647 2648 #if defined(SKVM_LLVM) 2649 // -- SKVM_LLVM -------------------------------------------------------------------------------- setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2650 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, 2651 const char* debug_name) { 2652 auto ctx = std::make_unique<llvm::LLVMContext>(); 2653 2654 auto mod = std::make_unique<llvm::Module>("", *ctx); 2655 // All the scary bare pointers from here on are owned by ctx or mod, I think. 2656 2657 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. 
2658 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4; 2659 2660 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), 2661 *i32 = llvm::Type::getInt32Ty(*ctx); 2662 2663 std::vector<llvm::Type*> arg_types = { i32 }; 2664 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2665 arg_types.push_back(ptr); 2666 } 2667 2668 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), 2669 arg_types, /*vararg?=*/false); 2670 llvm::Function* fn 2671 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); 2672 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2673 fn->addParamAttr(i+1, llvm::Attribute::NoAlias); 2674 } 2675 2676 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), 2677 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn), 2678 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), 2679 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), 2680 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn), 2681 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), 2682 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), 2683 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); 2684 2685 using IRBuilder = llvm::IRBuilder<>; 2686 2687 llvm::PHINode* n; 2688 std::vector<llvm::PHINode*> args; 2689 std::vector<llvm::Value*> vals(instructions.size()); 2690 2691 auto emit = [&](size_t i, bool scalar, IRBuilder* b) { 2692 auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i]; 2693 2694 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), 2695 *i8 = llvm::Type::getInt8Ty (*ctx), 2696 *i16 = llvm::Type::getInt16Ty(*ctx), 2697 *f32 = llvm::Type::getFloatTy(*ctx), 2698 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K, false ), 2699 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K, false ), 2700 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K, false ), 2701 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K, false ), 2702 *F32 = scalar ? 
f32 : llvm::VectorType::get(f32, K, false ); 2703 2704 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; 2705 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; 2706 2707 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; 2708 2709 llvm::Type* vt = nullptr; 2710 switch (llvm::Type* t = nullptr; op) { 2711 default: 2712 SkDebugf("can't llvm %s (%d)\n", name(op), op); 2713 return false; 2714 2715 case Op::assert_true: /*TODO*/ break; 2716 2717 case Op::trace_line: 2718 case Op::trace_var: 2719 case Op::trace_enter: 2720 case Op::trace_exit: 2721 case Op::trace_scope: 2722 /* Force this program to run in the interpreter. */ 2723 return false; 2724 2725 case Op::index: 2726 if (I32->isVectorTy()) { 2727 std::vector<llvm::Constant*> iota(K); 2728 for (int j = 0; j < K; j++) { 2729 iota[j] = b->getInt32(j); 2730 } 2731 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), 2732 llvm::ConstantVector::get(iota)); 2733 } else { 2734 vals[i] = n; 2735 } break; 2736 2737 case Op::load8: t = I8 ; goto load; 2738 case Op::load16: t = I16; goto load; 2739 case Op::load32: t = I32; goto load; 2740 load: { 2741 llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo()); 2742 vals[i] = b->CreateZExt( 2743 b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32); 2744 } break; 2745 2746 2747 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break; 2748 2749 case Op::uniform32: { 2750 llvm::Value* ptr = b->CreateBitCast( 2751 b->CreateConstInBoundsGEP1_32(i8, args[immA], immB), 2752 i32->getPointerTo()); 2753 llvm::Value* val = b->CreateZExt( 2754 b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32); 2755 vals[i] = I32->isVectorTy() ? 
b->CreateVectorSplat(K, val) 2756 : val; 2757 } break; 2758 2759 case Op::gather8: t = i8 ; vt = I8; goto gather; 2760 case Op::gather16: t = i16; vt = I16; goto gather; 2761 case Op::gather32: t = i32; vt = I32; goto gather; 2762 gather: { 2763 // Our gather base pointer is immB bytes off of uniform immA. 2764 llvm::Value* base = 2765 b->CreateLoad(b->CreateBitCast( 2766 b->CreateConstInBoundsGEP1_32(i8, args[immA],immB), 2767 t->getPointerTo()->getPointerTo())); 2768 2769 llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]); 2770 llvm::Value* gathered; 2771 if (ptr->getType()->isVectorTy()) { 2772 gathered = b->CreateMaskedGather( 2773 vt, 2774 ptr, 2775 llvm::Align{1}); 2776 } else { 2777 gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1}); 2778 } 2779 vals[i] = b->CreateZExt(gathered, I32); 2780 } break; 2781 2782 case Op::store8: t = I8 ; goto store; 2783 case Op::store16: t = I16; goto store; 2784 case Op::store32: t = I32; goto store; 2785 store: { 2786 llvm::Value* val = b->CreateTrunc(vals[x], t); 2787 llvm::Value* ptr = b->CreateBitCast(args[immA], 2788 val->getType()->getPointerTo()); 2789 vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1}); 2790 } break; 2791 2792 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; 2793 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; 2794 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; 2795 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; 2796 2797 case Op::select: 2798 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); 2799 break; 2800 2801 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; 2802 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; 2803 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; 2804 2805 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break; 2806 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); 
break; 2807 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break; 2808 2809 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; 2810 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; 2811 2812 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; 2813 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; 2814 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; 2815 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; 2816 2817 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; 2818 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; 2819 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; 2820 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; 2821 2822 case Op::fma_f32: 2823 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2824 {F(vals[x]), F(vals[y]), F(vals[z])})); 2825 break; 2826 2827 case Op::fms_f32: 2828 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2829 {F(vals[x]), F(vals[y]), 2830 b->CreateFNeg(F(vals[z]))})); 2831 break; 2832 2833 case Op::fnma_f32: 2834 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2835 {b->CreateFNeg(F(vals[x])), F(vals[y]), 2836 F(vals[z])})); 2837 break; 2838 2839 case Op::ceil: 2840 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x]))); 2841 break; 2842 case Op::floor: 2843 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); 2844 break; 2845 2846 case Op::max_f32: 2847 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), 2848 F(vals[y]), F(vals[x]))); 2849 break; 2850 case Op::min_f32: 2851 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), 2852 F(vals[y]), F(vals[x]))); 2853 break; 2854 2855 case Op::sqrt_f32: 2856 vals[i] = 
I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); 2857 break; 2858 2859 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; 2860 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; 2861 case Op::round : { 2862 // Basic impl when we can't use cvtps2dq and co. 2863 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); 2864 vals[i] = b->CreateFPToSI(round, I32); 2865 2866 #if 1 && defined(SK_CPU_X86) 2867 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. 2868 if (scalar) { 2869 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ 2870 llvm::Value* v = llvm::UndefValue::get( 2871 llvm::VectorType::get(f32, 4, false)); 2872 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); 2873 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); 2874 } else { 2875 SkASSERT(K == 4 || K == 8); 2876 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : 2877 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; 2878 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); 2879 } 2880 #endif 2881 } break; 2882 2883 } 2884 return true; 2885 }; 2886 2887 { 2888 IRBuilder b(enter); 2889 b.CreateBr(hoistK); 2890 } 2891 2892 // hoistK: emit each hoistable vector instruction; goto testK; 2893 // LLVM can do this sort of thing itself, but we've got the information cheap, 2894 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. 2895 { 2896 IRBuilder b(hoistK); 2897 2898 // Hoisted instructions will need args (think, uniforms), so set that up now. 2899 // These phi nodes are degenerate... they'll always be the passed-in args from enter. 2900 // Later on when we start looping the phi nodes will start looking useful. 2901 llvm::Argument* arg = fn->arg_begin(); 2902 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. 
2903 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2904 args.push_back(b.CreatePHI(arg->getType(), 1)); 2905 args.back()->addIncoming(arg++, enter); 2906 } 2907 2908 for (size_t i = 0; i < instructions.size(); i++) { 2909 if (instructions[i].can_hoist && !emit(i, false, &b)) { 2910 return; 2911 } 2912 } 2913 2914 b.CreateBr(testK); 2915 } 2916 2917 // testK: if (N >= K) goto loopK; else goto hoist1; 2918 { 2919 IRBuilder b(testK); 2920 2921 // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK. 2922 // These also start as the initial function arguments; hoistK can't have changed them. 2923 llvm::Argument* arg = fn->arg_begin(); 2924 2925 n = b.CreatePHI(arg->getType(), 2); 2926 n->addIncoming(arg++, hoistK); 2927 2928 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2929 args[i] = b.CreatePHI(arg->getType(), 2); 2930 args[i]->addIncoming(arg++, hoistK); 2931 } 2932 2933 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1); 2934 } 2935 2936 // loopK: ... 
insts on K x T vectors; N -= K, args += K*stride; goto testK; 2937 { 2938 IRBuilder b(loopK); 2939 for (size_t i = 0; i < instructions.size(); i++) { 2940 if (!instructions[i].can_hoist && !emit(i, false, &b)) { 2941 return; 2942 } 2943 } 2944 2945 // n -= K 2946 llvm::Value* n_next = b.CreateSub(n, b.getInt32(K)); 2947 n->addIncoming(n_next, loopK); 2948 2949 // Each arg ptr += K 2950 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2951 llvm::Value* arg_next 2952 = b.CreateConstInBoundsGEP1_32( 2953 llvm::Type::getInt8Ty (*ctx), 2954 args[i], 2955 K*fImpl->strides[i]); 2956 args[i]->addIncoming(arg_next, loopK); 2957 } 2958 b.CreateBr(testK); 2959 } 2960 2961 // hoist1: emit each hoistable scalar instruction; goto test1; 2962 { 2963 IRBuilder b(hoist1); 2964 for (size_t i = 0; i < instructions.size(); i++) { 2965 if (instructions[i].can_hoist && !emit(i, true, &b)) { 2966 return; 2967 } 2968 } 2969 b.CreateBr(test1); 2970 } 2971 2972 // test1: if (N >= 1) goto loop1; else goto leave; 2973 { 2974 IRBuilder b(test1); 2975 2976 // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1. 2977 llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2); 2978 n_new->addIncoming(n, hoist1); 2979 n = n_new; 2980 2981 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2982 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2); 2983 arg_new->addIncoming(args[i], hoist1); 2984 args[i] = arg_new; 2985 } 2986 2987 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave); 2988 } 2989 2990 // loop1: ... 
insts on scalars; N -= 1, args += stride; goto test1; 2991 { 2992 IRBuilder b(loop1); 2993 for (size_t i = 0; i < instructions.size(); i++) { 2994 if (!instructions[i].can_hoist && !emit(i, true, &b)) { 2995 return; 2996 } 2997 } 2998 2999 // n -= 1 3000 llvm::Value* n_next = b.CreateSub(n, b.getInt32(1)); 3001 n->addIncoming(n_next, loop1); 3002 3003 // Each arg ptr += 1 3004 for (size_t i = 0; i < fImpl->strides.size(); i++) { 3005 llvm::Value* arg_next 3006 = b.CreateConstInBoundsGEP1_32( 3007 llvm::Type::getInt8Ty (*ctx), args[i], fImpl->strides[i]); 3008 args[i]->addIncoming(arg_next, loop1); 3009 } 3010 b.CreateBr(test1); 3011 } 3012 3013 // leave: ret 3014 { 3015 IRBuilder b(leave); 3016 b.CreateRetVoid(); 3017 } 3018 3019 SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs())); 3020 3021 if (true) { 3022 SkString path = SkStringPrintf("/tmp/%s.bc", debug_name); 3023 std::error_code err; 3024 llvm::raw_fd_ostream os(path.c_str(), err); 3025 if (err) { 3026 return; 3027 } 3028 llvm::WriteBitcodeToFile(*mod, os); 3029 } 3030 3031 static SkOnce once; 3032 once([]{ 3033 SkAssertResult(false == llvm::InitializeNativeTarget()); 3034 SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter()); 3035 }); 3036 3037 if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod)) 3038 .setEngineKind(llvm::EngineKind::JIT) 3039 .setMCPU(llvm::sys::getHostCPUName()) 3040 .create()) { 3041 fImpl->llvm_ctx = std::move(ctx); 3042 fImpl->llvm_ee.reset(ee); 3043 3044 #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION) 3045 // Wait for llvm to compile 3046 void* function = (void*)ee->getFunctionAddress(debug_name); 3047 fImpl->jit_entry.store(function); 3048 // We have to be careful here about what we close over and how, in case fImpl moves. 3049 // fImpl itself may change, but its pointee fields won't, so close over them by value. 3050 // Also, debug_name will almost certainly leave scope, so copy it. 
3051 #else 3052 fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry, 3053 ee = fImpl->llvm_ee.get(), 3054 name = std::string(debug_name)]{ 3055 // std::atomic<void*>* dst; 3056 // llvm::ExecutionEngine* ee; 3057 // std::string name; 3058 dst->store( (void*)ee->getFunctionAddress(name.c_str()) ); 3059 }); 3060 #endif 3061 } 3062 } 3063 #endif // SKVM_LLVM 3064 waitForLLVM() const3065 void Program::waitForLLVM() const { 3066 #if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION) 3067 if (fImpl->llvm_compiling.valid()) { 3068 fImpl->llvm_compiling.wait(); 3069 } 3070 #endif 3071 } 3072 hasTraceHooks() const3073 bool Program::hasTraceHooks() const { 3074 // Identifies a program which has been instrumented for debugging. 3075 return !fImpl->traceHooks.empty(); 3076 } 3077 hasJIT() const3078 bool Program::hasJIT() const { 3079 // Program::hasJIT() is really just a debugging / test aid, 3080 // so we don't mind adding a sync point here to wait for compilation. 3081 this->waitForLLVM(); 3082 3083 return fImpl->jit_entry.load() != nullptr; 3084 } 3085 dropJIT()3086 void Program::dropJIT() { 3087 #if defined(SKVM_LLVM) 3088 this->waitForLLVM(); 3089 fImpl->llvm_ee .reset(nullptr); 3090 fImpl->llvm_ctx.reset(nullptr); 3091 #elif defined(SKVM_JIT) 3092 if (fImpl->dylib) { 3093 close_dylib(fImpl->dylib); 3094 } else if (auto jit_entry = fImpl->jit_entry.load()) { 3095 unmap_jit_buffer(jit_entry, fImpl->jit_size); 3096 } 3097 #else 3098 SkASSERT(!this->hasJIT()); 3099 #endif 3100 3101 fImpl->jit_entry.store(nullptr); 3102 fImpl->jit_size = 0; 3103 fImpl->dylib = nullptr; 3104 } 3105 Program()3106 Program::Program() : fImpl(std::make_unique<Impl>()) {} 3107 ~Program()3108 Program::~Program() { 3109 // Moved-from Programs may have fImpl == nullptr. 
3110 if (fImpl) { 3111 this->dropJIT(); 3112 } 3113 } 3114 Program(Program && other)3115 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} 3116 operator =(Program && other)3117 Program& Program::operator=(Program&& other) { 3118 fImpl = std::move(other.fImpl); 3119 return *this; 3120 } 3121 Program(const std::vector<OptimizedInstruction> & instructions,std::unique_ptr<viz::Visualizer> visualizer,const std::vector<int> & strides,const std::vector<TraceHook * > & traceHooks,const char * debug_name,bool allow_jit)3122 Program::Program(const std::vector<OptimizedInstruction>& instructions, 3123 std::unique_ptr<viz::Visualizer> visualizer, 3124 const std::vector<int>& strides, 3125 const std::vector<TraceHook*>& traceHooks, 3126 const char* debug_name, bool allow_jit) : Program() { 3127 fImpl->visualizer = std::move(visualizer); 3128 fImpl->strides = strides; 3129 fImpl->traceHooks = traceHooks; 3130 if (gSkVMAllowJIT && allow_jit) { 3131 #if 1 && defined(SKVM_LLVM) 3132 this->setupLLVM(instructions, debug_name); 3133 #elif 1 && defined(SKVM_JIT) 3134 this->setupJIT(instructions, debug_name); 3135 #endif 3136 } 3137 3138 // Might as well do this after setupLLVM() to get a little more time to compile. 3139 this->setupInterpreter(instructions); 3140 } 3141 instructions() const3142 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } nargs() const3143 int Program::nargs() const { return (int)fImpl->strides.size(); } nregs() const3144 int Program::nregs() const { return fImpl->regs; } loop() const3145 int Program::loop () const { return fImpl->loop; } empty() const3146 bool Program::empty() const { return fImpl->instructions.empty(); } 3147 3148 // Translate OptimizedInstructions to InterpreterInstructions. 
setupInterpreter(const std::vector<OptimizedInstruction> & instructions)3149 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { 3150 // Register each instruction is assigned to. 3151 std::vector<Reg> reg(instructions.size()); 3152 3153 // This next bit is a bit more complicated than strictly necessary; 3154 // we could just assign every instruction to its own register. 3155 // 3156 // But recycling registers is fairly cheap, and good practice for the 3157 // JITs where minimizing register pressure really is important. 3158 // 3159 // We have effectively infinite registers, so we hoist any value we can. 3160 // (The JIT may choose a more complex policy to reduce register pressure.) 3161 3162 fImpl->regs = 0; 3163 std::vector<Reg> avail; 3164 3165 // Assign this value to a register, recycling them where we can. 3166 auto assign_register = [&](Val id) { 3167 const OptimizedInstruction& inst = instructions[id]; 3168 3169 // If this is a real input and it's lifetime ends at this instruction, 3170 // we can recycle the register it's occupying. 3171 auto maybe_recycle_register = [&](Val input) { 3172 if (input != NA && instructions[input].death == id) { 3173 avail.push_back(reg[input]); 3174 } 3175 }; 3176 3177 // Take care to not recycle the same register twice. 3178 const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w; 3179 if (true ) { maybe_recycle_register(x); } 3180 if (y != x ) { maybe_recycle_register(y); } 3181 if (z != x && z != y ) { maybe_recycle_register(z); } 3182 if (w != x && w != y && w != z) { maybe_recycle_register(w); } 3183 3184 // Instructions that die at themselves (stores) don't need a register. 3185 if (inst.death != id) { 3186 // Allocate a register if we have to, preferring to reuse anything available. 
3187 if (avail.empty()) { 3188 reg[id] = fImpl->regs++; 3189 } else { 3190 reg[id] = avail.back(); 3191 avail.pop_back(); 3192 } 3193 } 3194 }; 3195 3196 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. 3197 for (Val id = 0; id < (Val)instructions.size(); id++) { 3198 if ( instructions[id].can_hoist) { assign_register(id); } 3199 } 3200 for (Val id = 0; id < (Val)instructions.size(); id++) { 3201 if (!instructions[id].can_hoist) { assign_register(id); } 3202 } 3203 3204 // Translate OptimizedInstructions to InterpreterIstructions by mapping values to 3205 // registers. This will be two passes, first hoisted instructions, then inside the loop. 3206 3207 // The loop begins at the fImpl->loop'th Instruction. 3208 fImpl->loop = 0; 3209 fImpl->instructions.reserve(instructions.size()); 3210 3211 // Add a mapping for the N/A sentinel Val to any arbitrary register 3212 // so lookups don't have to know which arguments are used by which Ops. 3213 auto lookup_register = [&](Val id) { 3214 return id == NA ? 
(Reg)0 3215 : reg[id]; 3216 }; 3217 3218 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { 3219 InterpreterInstruction pinst{ 3220 inst.op, 3221 lookup_register(id), 3222 lookup_register(inst.x), 3223 lookup_register(inst.y), 3224 lookup_register(inst.z), 3225 lookup_register(inst.w), 3226 inst.immA, 3227 inst.immB, 3228 inst.immC, 3229 }; 3230 fImpl->instructions.push_back(pinst); 3231 }; 3232 3233 for (Val id = 0; id < (Val)instructions.size(); id++) { 3234 const OptimizedInstruction& inst = instructions[id]; 3235 if (inst.can_hoist) { 3236 push_instruction(id, inst); 3237 fImpl->loop++; 3238 } 3239 } 3240 for (Val id = 0; id < (Val)instructions.size(); id++) { 3241 const OptimizedInstruction& inst = instructions[id]; 3242 if (!inst.can_hoist) { 3243 push_instruction(id, inst); 3244 } 3245 } 3246 } 3247 3248 #if defined(SKVM_JIT) 3249 3250 namespace SkVMJitTypes { 3251 #if defined(__x86_64__) || defined(_M_X64) 3252 using Reg = Assembler::Ymm; 3253 #elif defined(__aarch64__) 3254 using Reg = Assembler::V; 3255 #endif 3256 } // namespace SkVMJitTypes 3257 jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const3258 bool Program::jit(const std::vector<OptimizedInstruction>& instructions, 3259 int* stack_hint, 3260 uint32_t* registers_used, 3261 Assembler* a) const { 3262 using A = Assembler; 3263 using SkVMJitTypes::Reg; 3264 3265 SkTHashMap<int, A::Label> constants; // Constants (mostly splats) share the same pool. 3266 A::Label iota; // Varies per lane, for Op::index. 3267 A::Label load64_index; // Used to load low or high half of 64-bit lanes. 3268 3269 // The `regs` array tracks everything we know about each register's state: 3270 // - NA: empty 3271 // - RES: reserved by ABI 3272 // - TMP: holding a temporary 3273 // - id: holding Val id 3274 constexpr Val RES = NA-1, 3275 TMP = RES-1; 3276 3277 // Map val -> stack slot. 
3278 std::vector<int> stack_slot(instructions.size(), NA); 3279 int next_stack_slot = 0; 3280 3281 const int nstack_slots = *stack_hint >= 0 ? *stack_hint 3282 : stack_slot.size(); 3283 #if defined(__x86_64__) || defined(_M_X64) 3284 if (!SkCpu::Supports(SkCpu::HSW)) { 3285 return false; 3286 } 3287 const int K = 8; 3288 #if defined(_M_X64) // Important to check this first; clang-cl defines both. 3289 const A::GP64 N = A::rcx, 3290 GP0 = A::rax, 3291 GP1 = A::r11, 3292 arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi }; 3293 3294 // xmm6-15 need are callee-saved. 3295 std::array<Val,16> regs = { 3296 NA, NA, NA, NA, NA, NA,RES,RES, 3297 RES,RES,RES,RES, RES,RES,RES,RES, 3298 }; 3299 const uint32_t incoming_registers_used = *registers_used; 3300 3301 auto enter = [&]{ 3302 // rcx,rdx,r8,r9 are all already holding their correct values. 3303 // Load caller-saved r10 from rsp+40 if there's a fourth arg. 3304 if (fImpl->strides.size() >= 4) { 3305 a->mov(A::r10, A::Mem{A::rsp, 40}); 3306 } 3307 // Load callee-saved rdi from rsp+48 if there's a fifth arg, 3308 // first saving it to ABI reserved shadow area rsp+8. 3309 if (fImpl->strides.size() >= 5) { 3310 a->mov(A::Mem{A::rsp, 8}, A::rdi); 3311 a->mov(A::rdi, A::Mem{A::rsp, 48}); 3312 } 3313 // Load callee-saved rsi from rsp+56 if there's a sixth arg, 3314 // first saving it to ABI reserved shadow area rsp+16. 3315 if (fImpl->strides.size() >= 6) { 3316 a->mov(A::Mem{A::rsp, 16}, A::rsi); 3317 a->mov(A::rsi, A::Mem{A::rsp, 56}); 3318 } 3319 3320 // Allocate stack for our values and callee-saved xmm6-15. 
3321 int stack_needed = nstack_slots*K*4; 3322 for (int r = 6; r < 16; r++) { 3323 if (incoming_registers_used & (1<<r)) { 3324 stack_needed += 16; 3325 } 3326 } 3327 if (stack_needed) { a->sub(A::rsp, stack_needed); } 3328 3329 int next_saved_xmm = nstack_slots*K*4; 3330 for (int r = 6; r < 16; r++) { 3331 if (incoming_registers_used & (1<<r)) { 3332 a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r); 3333 next_saved_xmm += 16; 3334 regs[r] = NA; 3335 } 3336 } 3337 }; 3338 auto exit = [&]{ 3339 // The second pass of jit() shouldn't use any register it didn't in the first pass. 3340 SkASSERT((*registers_used & incoming_registers_used) == *registers_used); 3341 3342 // Restore callee-saved xmm6-15 and the stack pointer. 3343 int stack_used = nstack_slots*K*4; 3344 for (int r = 6; r < 16; r++) { 3345 if (incoming_registers_used & (1<<r)) { 3346 a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used}); 3347 stack_used += 16; 3348 } 3349 } 3350 if (stack_used) { a->add(A::rsp, stack_used); } 3351 3352 // Restore callee-saved rdi/rsi if we used them. 3353 if (fImpl->strides.size() >= 5) { 3354 a->mov(A::rdi, A::Mem{A::rsp, 8}); 3355 } 3356 if (fImpl->strides.size() >= 6) { 3357 a->mov(A::rsi, A::Mem{A::rsp, 16}); 3358 } 3359 3360 a->vzeroupper(); 3361 a->ret(); 3362 }; 3363 #elif defined(__x86_64__) 3364 const A::GP64 N = A::rdi, 3365 GP0 = A::rax, 3366 GP1 = A::r11, 3367 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 }; 3368 3369 // All 16 ymm registers are available to use. 3370 std::array<Val,16> regs = { 3371 NA,NA,NA,NA, NA,NA,NA,NA, 3372 NA,NA,NA,NA, NA,NA,NA,NA, 3373 }; 3374 3375 auto enter = [&]{ 3376 // Load caller-saved r10 from rsp+8 if there's a sixth arg. 
3377 if (fImpl->strides.size() >= 6) { 3378 a->mov(A::r10, A::Mem{A::rsp, 8}); 3379 } 3380 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } 3381 }; 3382 auto exit = [&]{ 3383 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); } 3384 a->vzeroupper(); 3385 a->ret(); 3386 }; 3387 #endif 3388 3389 auto load_from_memory = [&](Reg r, Val v) { 3390 if (instructions[v].op == Op::splat) { 3391 if (instructions[v].immA == 0) { 3392 a->vpxor(r,r,r); 3393 } else { 3394 a->vmovups(r, constants.find(instructions[v].immA)); 3395 } 3396 } else { 3397 SkASSERT(stack_slot[v] != NA); 3398 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4}); 3399 } 3400 }; 3401 auto store_to_stack = [&](Reg r, Val v) { 3402 SkASSERT(next_stack_slot < nstack_slots); 3403 stack_slot[v] = next_stack_slot++; 3404 a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r); 3405 }; 3406 #elif defined(__aarch64__) 3407 const int K = 4; 3408 const A::X N = A::x0, 3409 GP0 = A::x8, 3410 GP1 = A::x9, 3411 arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; 3412 3413 // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit. 
3414 std::array<Val,32> regs = { 3415 NA, NA, NA, NA, NA, NA, NA, NA, 3416 RES,RES,RES,RES, RES,RES,RES,RES, 3417 NA, NA, NA, NA, NA, NA, NA, NA, 3418 NA, NA, NA, NA, NA, NA, NA, NA, 3419 }; 3420 3421 auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } }; 3422 auto exit = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); } 3423 a->ret(A::x30); }; 3424 3425 auto load_from_memory = [&](Reg r, Val v) { 3426 if (instructions[v].op == Op::splat) { 3427 if (instructions[v].immA == 0) { 3428 a->eor16b(r,r,r); 3429 } else { 3430 a->ldrq(r, constants.find(instructions[v].immA)); 3431 } 3432 } else { 3433 SkASSERT(stack_slot[v] != NA); 3434 a->ldrq(r, A::sp, stack_slot[v]); 3435 } 3436 }; 3437 auto store_to_stack = [&](Reg r, Val v) { 3438 SkASSERT(next_stack_slot < nstack_slots); 3439 stack_slot[v] = next_stack_slot++; 3440 a->strq(r, A::sp, stack_slot[v]); 3441 }; 3442 #endif 3443 3444 *registers_used = 0; // We'll update this as we go. 3445 3446 if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) { 3447 return false; 3448 } 3449 3450 auto emit = [&](Val id, bool scalar) { 3451 const int active_lanes = scalar ? 1 : K; 3452 const OptimizedInstruction& inst = instructions[id]; 3453 const Op op = inst.op; 3454 const Val x = inst.x, 3455 y = inst.y, 3456 z = inst.z, 3457 w = inst.w; 3458 const int immA = inst.immA, 3459 immB = inst.immB, 3460 immC = inst.immC; 3461 3462 // alloc_tmp() returns the first of N adjacent temporary registers, 3463 // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst(). 3464 auto alloc_tmp = [&](int N=1) -> Reg { 3465 auto needs_spill = [&](Val v) -> bool { 3466 SkASSERT(v >= 0); // {NA,TMP,RES} need to be handled before calling this. 3467 return stack_slot[v] == NA // We haven't spilled it already? 3468 && instructions[v].op != Op::splat; // No need to spill constants. 3469 }; 3470 3471 // We want to find a block of N adjacent registers requiring the fewest spills. 
3472 int best_block = -1, 3473 min_spills = 0x7fff'ffff; 3474 for (int block = 0; block+N <= (int)regs.size(); block++) { 3475 int spills = 0; 3476 for (int r = block; r < block+N; r++) { 3477 Val v = regs[r]; 3478 // Registers holding NA (nothing) are ideal, nothing to spill. 3479 if (v == NA) { 3480 continue; 3481 } 3482 // We can't spill anything REServed or that we'll need this instruction. 3483 if (v == RES || 3484 v == TMP || v == id || v == x || v == y || v == z || v == w) { 3485 spills = 0x7fff'ffff; 3486 block = r; // (optimization) continue outer loop at next register. 3487 break; 3488 } 3489 // Usually here we've got a value v that we'd have to spill to the stack 3490 // before reusing its register, but sometimes even now we get a freebie. 3491 spills += needs_spill(v) ? 1 : 0; 3492 } 3493 3494 // TODO: non-arbitrary tie-breaking? 3495 if (min_spills > spills) { 3496 min_spills = spills; 3497 best_block = block; 3498 } 3499 if (min_spills == 0) { 3500 break; // (optimization) stop early if we find an unbeatable block. 3501 } 3502 } 3503 3504 // TODO: our search's success isn't obviously guaranteed... it depends on N 3505 // and the number and relative position in regs of any unspillable values. 3506 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64; 3507 // we'll need to revisit this logic should this assert fire. 3508 SkASSERT(min_spills <= N); 3509 3510 // Spill what needs spilling, and mark the block all as TMP. 
3511 for (int r = best_block; r < best_block+N; r++) { 3512 Val& v = regs[r]; 3513 *registers_used |= (1<<r); 3514 3515 SkASSERT(v == NA || v >= 0); 3516 if (v >= 0 && needs_spill(v)) { 3517 store_to_stack((Reg)r, v); 3518 SkASSERT(!needs_spill(v)); 3519 min_spills--; 3520 } 3521 3522 v = TMP; 3523 } 3524 SkASSERT(min_spills == 0); 3525 return (Reg)best_block; 3526 }; 3527 3528 auto free_tmp = [&](Reg r) { 3529 SkASSERT(regs[r] == TMP); 3530 regs[r] = NA; 3531 }; 3532 3533 // Which register holds dst,x,y,z,w for this instruction? NA if none does yet. 3534 int rd = NA, 3535 rx = NA, 3536 ry = NA, 3537 rz = NA, 3538 rw = NA; 3539 3540 auto update_regs = [&](Reg r, Val v) { 3541 if (v == id) { rd = r; } 3542 if (v == x) { rx = r; } 3543 if (v == y) { ry = r; } 3544 if (v == z) { rz = r; } 3545 if (v == w) { rw = r; } 3546 return r; 3547 }; 3548 3549 auto find_existing_reg = [&](Val v) -> int { 3550 // Quick-check our working registers. 3551 if (v == id && rd != NA) { return rd; } 3552 if (v == x && rx != NA) { return rx; } 3553 if (v == y && ry != NA) { return ry; } 3554 if (v == z && rz != NA) { return rz; } 3555 if (v == w && rw != NA) { return rw; } 3556 3557 // Search inter-instruction register map. 3558 for (auto [r,val] : SkMakeEnumerate(regs)) { 3559 if (val == v) { 3560 return update_regs((Reg)r, v); 3561 } 3562 } 3563 return NA; 3564 }; 3565 3566 // Return a register for Val, holding that value if it already exists. 3567 // During this instruction all calls to r(v) will return the same register. 3568 auto r = [&](Val v) -> Reg { 3569 SkASSERT(v >= 0); 3570 3571 if (int found = find_existing_reg(v); found != NA) { 3572 return (Reg)found; 3573 } 3574 3575 Reg r = alloc_tmp(); 3576 SkASSERT(regs[r] == TMP); 3577 3578 SkASSERT(v <= id); 3579 if (v < id) { 3580 // If v < id, we're loading one of this instruction's inputs. 3581 // If v == id we're just allocating its destination register. 
3582 load_from_memory(r, v); 3583 } 3584 regs[r] = v; 3585 return update_regs(r, v); 3586 }; 3587 3588 auto dies_here = [&](Val v) -> bool { 3589 SkASSERT(v >= 0); 3590 return instructions[v].death == id; 3591 }; 3592 3593 // Alias dst() to r(v) if dies_here(v). 3594 auto try_alias = [&](Val v) -> bool { 3595 SkASSERT(v == x || v == y || v == z || v == w); 3596 if (dies_here(v)) { 3597 rd = r(v); // Vals v and id share a register for this instruction. 3598 regs[rd] = id; // Next instruction, Val id will be in the register, not Val v. 3599 return true; 3600 } 3601 return false; 3602 }; 3603 3604 // Generally r(id), 3605 // but with a hint, try to alias dst() to r(v) if dies_here(v). 3606 auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg { 3607 if (hint1 != NA && try_alias(hint1)) { return r(id); } 3608 if (hint2 != NA && try_alias(hint2)) { return r(id); } 3609 return r(id); 3610 }; 3611 3612 #if defined(__aarch64__) // Nothing sneaky, just unused on x86-64. 3613 auto mark_tmp_as_dst = [&](Reg tmp) { 3614 SkASSERT(regs[tmp] == TMP); 3615 rd = tmp; 3616 regs[rd] = id; 3617 SkASSERT(dst() == tmp); 3618 }; 3619 #endif 3620 3621 #if defined(__x86_64__) || defined(_M_X64) 3622 // On x86 we can work with many values directly from the stack or program constant pool. 3623 auto any = [&](Val v) -> A::Operand { 3624 SkASSERT(v >= 0); 3625 SkASSERT(v < id); 3626 3627 if (int found = find_existing_reg(v); found != NA) { 3628 return (Reg)found; 3629 } 3630 if (instructions[v].op == Op::splat) { 3631 return constants.find(instructions[v].immA); 3632 } 3633 return A::Mem{A::rsp, stack_slot[v]*K*4}; 3634 }; 3635 3636 // This is never really worth asking except when any() might be used; 3637 // if we need this value in ARM, might as well just call r(v) to get it into a register. 3638 auto in_reg = [&](Val v) -> bool { 3639 return find_existing_reg(v) != NA; 3640 }; 3641 #endif 3642 3643 switch (op) { 3644 // Make sure splat constants can be found by load_from_memory() or any(). 
3645 case Op::splat: 3646 (void)constants[immA]; 3647 break; 3648 3649 #if defined(__x86_64__) || defined(_M_X64) 3650 case Op::assert_true: { 3651 a->vptest (r(x), &constants[0xffffffff]); 3652 A::Label all_true; 3653 a->jc(&all_true); 3654 a->int3(); 3655 a->label(&all_true); 3656 } break; 3657 3658 case Op::trace_line: 3659 case Op::trace_var: 3660 case Op::trace_enter: 3661 case Op::trace_exit: 3662 case Op::trace_scope: 3663 /* Force this program to run in the interpreter. */ 3664 return false; 3665 3666 case Op::store8: 3667 if (scalar) { 3668 a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3669 } else { 3670 a->vpackusdw(dst(x), r(x), r(x)); 3671 a->vpermq (dst(), dst(), 0xd8); 3672 a->vpackuswb(dst(), dst(), dst()); 3673 a->vmovq (A::Mem{arg[immA]}, (A::Xmm)dst()); 3674 } break; 3675 3676 case Op::store16: 3677 if (scalar) { 3678 a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3679 } else { 3680 a->vpackusdw(dst(x), r(x), r(x)); 3681 a->vpermq (dst(), dst(), 0xd8); 3682 a->vmovups (A::Mem{arg[immA]}, (A::Xmm)dst()); 3683 } break; 3684 3685 case Op::store32: if (scalar) { a->vmovd (A::Mem{arg[immA]}, (A::Xmm)r(x)); } 3686 else { a->vmovups(A::Mem{arg[immA]}, r(x)); } 3687 break; 3688 3689 case Op::store64: if (scalar) { 3690 a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x)); 3691 a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y)); 3692 } else { 3693 // r(x) = {a,b,c,d|e,f,g,h} 3694 // r(y) = {i,j,k,l|m,n,o,p} 3695 // We want to write a,i,b,j,c,k,d,l,e,m... 
3696 A::Ymm L = alloc_tmp(), 3697 H = alloc_tmp(); 3698 a->vpunpckldq(L, r(x), any(y)); // L = {a,i,b,j|e,m,f,n} 3699 a->vpunpckhdq(H, r(x), any(y)); // H = {c,k,d,l|g,o,h,p} 3700 a->vperm2f128(dst(), L,H, 0x20); // = {a,i,b,j|c,k,d,l} 3701 a->vmovups(A::Mem{arg[immA], 0}, dst()); 3702 a->vperm2f128(dst(), L,H, 0x31); // = {e,m,f,n|g,o,h,p} 3703 a->vmovups(A::Mem{arg[immA],32}, dst()); 3704 free_tmp(L); 3705 free_tmp(H); 3706 } break; 3707 3708 case Op::store128: { 3709 // TODO: >32-bit stores 3710 a->vmovd (A::Mem{arg[immA], 0*16 + 0}, (A::Xmm)r(x) ); 3711 a->vmovd (A::Mem{arg[immA], 0*16 + 4}, (A::Xmm)r(y) ); 3712 a->vmovd (A::Mem{arg[immA], 0*16 + 8}, (A::Xmm)r(z) ); 3713 a->vmovd (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w) ); 3714 if (scalar) { break; } 3715 3716 a->vpextrd(A::Mem{arg[immA], 1*16 + 0}, (A::Xmm)r(x), 1); 3717 a->vpextrd(A::Mem{arg[immA], 1*16 + 4}, (A::Xmm)r(y), 1); 3718 a->vpextrd(A::Mem{arg[immA], 1*16 + 8}, (A::Xmm)r(z), 1); 3719 a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1); 3720 3721 a->vpextrd(A::Mem{arg[immA], 2*16 + 0}, (A::Xmm)r(x), 2); 3722 a->vpextrd(A::Mem{arg[immA], 2*16 + 4}, (A::Xmm)r(y), 2); 3723 a->vpextrd(A::Mem{arg[immA], 2*16 + 8}, (A::Xmm)r(z), 2); 3724 a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2); 3725 3726 a->vpextrd(A::Mem{arg[immA], 3*16 + 0}, (A::Xmm)r(x), 3); 3727 a->vpextrd(A::Mem{arg[immA], 3*16 + 4}, (A::Xmm)r(y), 3); 3728 a->vpextrd(A::Mem{arg[immA], 3*16 + 8}, (A::Xmm)r(z), 3); 3729 a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3); 3730 // Now we need to store the upper 128 bits of x,y,z,w. 3731 // Storing in this order rather than interlacing minimizes temporaries. 
3732 a->vextracti128(dst(), r(x), 1); 3733 a->vmovd (A::Mem{arg[immA], 4*16 + 0}, (A::Xmm)dst() ); 3734 a->vpextrd(A::Mem{arg[immA], 5*16 + 0}, (A::Xmm)dst(), 1); 3735 a->vpextrd(A::Mem{arg[immA], 6*16 + 0}, (A::Xmm)dst(), 2); 3736 a->vpextrd(A::Mem{arg[immA], 7*16 + 0}, (A::Xmm)dst(), 3); 3737 3738 a->vextracti128(dst(), r(y), 1); 3739 a->vmovd (A::Mem{arg[immA], 4*16 + 4}, (A::Xmm)dst() ); 3740 a->vpextrd(A::Mem{arg[immA], 5*16 + 4}, (A::Xmm)dst(), 1); 3741 a->vpextrd(A::Mem{arg[immA], 6*16 + 4}, (A::Xmm)dst(), 2); 3742 a->vpextrd(A::Mem{arg[immA], 7*16 + 4}, (A::Xmm)dst(), 3); 3743 3744 a->vextracti128(dst(), r(z), 1); 3745 a->vmovd (A::Mem{arg[immA], 4*16 + 8}, (A::Xmm)dst() ); 3746 a->vpextrd(A::Mem{arg[immA], 5*16 + 8}, (A::Xmm)dst(), 1); 3747 a->vpextrd(A::Mem{arg[immA], 6*16 + 8}, (A::Xmm)dst(), 2); 3748 a->vpextrd(A::Mem{arg[immA], 7*16 + 8}, (A::Xmm)dst(), 3); 3749 3750 a->vextracti128(dst(), r(w), 1); 3751 a->vmovd (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst() ); 3752 a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1); 3753 a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2); 3754 a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3); 3755 } break; 3756 3757 case Op::load8: if (scalar) { 3758 a->vpxor (dst(), dst(), dst()); 3759 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3760 } else { 3761 a->vpmovzxbd(dst(), A::Mem{arg[immA]}); 3762 } break; 3763 3764 case Op::load16: if (scalar) { 3765 a->vpxor (dst(), dst(), dst()); 3766 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3767 } else { 3768 a->vpmovzxwd(dst(), A::Mem{arg[immA]}); 3769 } break; 3770 3771 case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), A::Mem{arg[immA]}); } 3772 else { a->vmovups( dst(), A::Mem{arg[immA]}); } 3773 break; 3774 3775 case Op::load64: if (scalar) { 3776 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3777 } else { 3778 A::Ymm tmp = alloc_tmp(); 3779 a->vmovups(tmp, &load64_index); 3780 a->vpermps(dst(), tmp, 
A::Mem{arg[immA], 0}); 3781 a->vpermps( tmp, tmp, A::Mem{arg[immA], 32}); 3782 // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1. 3783 a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20); 3784 free_tmp(tmp); 3785 } break; 3786 3787 case Op::load128: if (scalar) { 3788 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3789 } else { 3790 // Load 4 low values into xmm tmp, 3791 A::Ymm tmp = alloc_tmp(); 3792 A::Xmm t = (A::Xmm)tmp; 3793 a->vmovd (t, A::Mem{arg[immA], 0*16 + 4*immB} ); 3794 a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1); 3795 a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2); 3796 a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3); 3797 3798 // Load 4 high values into xmm dst(), 3799 A::Xmm d = (A::Xmm)dst(); 3800 a->vmovd (d, A::Mem{arg[immA], 4*16 + 4*immB} ); 3801 a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1); 3802 a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2); 3803 a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3); 3804 3805 // Merge the two, ymm dst() = {xmm tmp|xmm dst()} 3806 a->vperm2f128(dst(), tmp,dst(), 0x20); 3807 free_tmp(tmp); 3808 } break; 3809 3810 case Op::gather8: { 3811 // As usual, the gather base pointer is immB bytes off of uniform immA. 3812 a->mov(GP0, A::Mem{arg[immA], immB}); 3813 3814 A::Ymm tmp = alloc_tmp(); 3815 a->vmovups(tmp, any(x)); 3816 3817 for (int i = 0; i < active_lanes; i++) { 3818 if (i == 4) { 3819 // vpextrd can only pluck indices out from an Xmm register, 3820 // so we manually swap over to the top when we're halfway through. 3821 a->vextracti128((A::Xmm)tmp, tmp, 1); 3822 } 3823 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3824 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i); 3825 } 3826 a->vpmovzxbd(dst(), dst()); 3827 free_tmp(tmp); 3828 } break; 3829 3830 case Op::gather16: { 3831 // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd. 
3832 a->mov(GP0, A::Mem{arg[immA], immB}); 3833 3834 A::Ymm tmp = alloc_tmp(); 3835 a->vmovups(tmp, any(x)); 3836 3837 for (int i = 0; i < active_lanes; i++) { 3838 if (i == 4) { 3839 a->vextracti128((A::Xmm)tmp, tmp, 1); 3840 } 3841 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3842 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i); 3843 } 3844 a->vpmovzxwd(dst(), dst()); 3845 free_tmp(tmp); 3846 } break; 3847 3848 case Op::gather32: 3849 if (scalar) { 3850 // Our gather base pointer is immB bytes off of uniform immA. 3851 a->mov(GP0, A::Mem{arg[immA], immB}); 3852 3853 // Grab our index from lane 0 of the index argument. 3854 a->vmovd(GP1, (A::Xmm)r(x)); 3855 3856 // dst = *(base + 4*index) 3857 a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR}); 3858 } else { 3859 a->mov(GP0, A::Mem{arg[immA], immB}); 3860 3861 A::Ymm mask = alloc_tmp(); 3862 a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) 3863 3864 a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask); 3865 free_tmp(mask); 3866 } 3867 break; 3868 3869 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB}); 3870 break; 3871 3872 case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB}); 3873 a->vbroadcastss(dst(), A::Mem{GP0, immC}); 3874 break; 3875 3876 case Op::index: a->vmovd((A::Xmm)dst(), N); 3877 a->vbroadcastss(dst(), dst()); 3878 a->vpsubd(dst(), dst(), &iota); 3879 break; 3880 3881 // We can swap the arguments of symmetric instructions to make better use of any(). 
3882 case Op::add_f32: 3883 if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); } 3884 else { a->vaddps(dst(y), r(y), any(x)); } 3885 break; 3886 3887 case Op::mul_f32: 3888 if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); } 3889 else { a->vmulps(dst(y), r(y), any(x)); } 3890 break; 3891 3892 case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break; 3893 case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break; 3894 case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters, 3895 case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max. 3896 3897 case Op::fma_f32: 3898 if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else 3899 if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else 3900 if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else 3901 { a->vmovups (dst(), any(x)); 3902 a->vfmadd132ps(dst(), r(z), any(y)); } 3903 break; 3904 3905 case Op::fms_f32: 3906 if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else 3907 if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else 3908 if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else 3909 { a->vmovups (dst(), any(x)); 3910 a->vfmsub132ps(dst(), r(z), any(y)); } 3911 break; 3912 3913 case Op::fnma_f32: 3914 if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else 3915 if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else 3916 if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else 3917 { a->vmovups (dst(), any(x)); 3918 a->vfnmadd132ps(dst(), r(z), any(y)); } 3919 break; 3920 3921 // In situations like this we want to try aliasing dst(x) when x is 3922 // already in a register, but not if we'd have to load it from the stack 3923 // just to alias it. That's done better directly into the new register. 
3924 case Op::sqrt_f32: 3925 if (in_reg(x)) { a->vsqrtps(dst(x), r(x)); } 3926 else { a->vsqrtps(dst(), any(x)); } 3927 break; 3928 3929 case Op::add_i32: 3930 if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); } 3931 else { a->vpaddd(dst(y), r(y), any(x)); } 3932 break; 3933 3934 case Op::mul_i32: 3935 if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); } 3936 else { a->vpmulld(dst(y), r(y), any(x)); } 3937 break; 3938 3939 case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break; 3940 3941 case Op::bit_and: 3942 if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); } 3943 else { a->vpand(dst(y), r(y), any(x)); } 3944 break; 3945 case Op::bit_or: 3946 if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); } 3947 else { a->vpor(dst(y), r(y), any(x)); } 3948 break; 3949 case Op::bit_xor: 3950 if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); } 3951 else { a->vpxor(dst(y), r(y), any(x)); } 3952 break; 3953 3954 case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x. 3955 3956 case Op::select: 3957 if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); } 3958 else { a->vpblendvb(dst(x), r(z), any(y), r(x)); } 3959 break; 3960 3961 case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break; 3962 case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break; 3963 case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break; 3964 3965 case Op::eq_i32: 3966 if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); } 3967 else { a->vpcmpeqd(dst(y), r(y), any(x)); } 3968 break; 3969 3970 case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break; 3971 3972 case Op::eq_f32: 3973 if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); } 3974 else { a->vcmpeqps(dst(y), r(y), any(x)); } 3975 break; 3976 case Op::neq_f32: 3977 if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); } 3978 else { a->vcmpneqps(dst(y), r(y), any(x)); } 3979 break; 3980 3981 case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break; 3982 case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break; 3983 3984 case 
Op::ceil: 3985 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); } 3986 else { a->vroundps(dst(), any(x), Assembler::CEIL); } 3987 break; 3988 3989 case Op::floor: 3990 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::FLOOR); } 3991 else { a->vroundps(dst(), any(x), Assembler::FLOOR); } 3992 break; 3993 3994 case Op::to_f32: 3995 if (in_reg(x)) { a->vcvtdq2ps(dst(x), r(x)); } 3996 else { a->vcvtdq2ps(dst(), any(x)); } 3997 break; 3998 3999 case Op::trunc: 4000 if (in_reg(x)) { a->vcvttps2dq(dst(x), r(x)); } 4001 else { a->vcvttps2dq(dst(), any(x)); } 4002 break; 4003 4004 case Op::round: 4005 if (in_reg(x)) { a->vcvtps2dq(dst(x), r(x)); } 4006 else { a->vcvtps2dq(dst(), any(x)); } 4007 break; 4008 4009 case Op::to_fp16: 4010 a->vcvtps2ph(dst(x), r(x), A::CURRENT); // f32 ymm -> f16 xmm 4011 a->vpmovzxwd(dst(), dst()); // f16 xmm -> f16 ymm 4012 break; 4013 4014 case Op::from_fp16: 4015 a->vpackusdw(dst(x), r(x), r(x)); // f16 ymm -> f16 xmm 4016 a->vpermq (dst(), dst(), 0xd8); // swap middle two 64-bit lanes 4017 a->vcvtph2ps(dst(), dst()); // f16 xmm -> f32 ymm 4018 break; 4019 4020 case Op::duplicate: break; 4021 4022 #elif defined(__aarch64__) 4023 case Op::assert_true: { 4024 a->uminv4s(dst(), r(x)); // uminv acts like an all() across the vector. 4025 a->movs(GP0, dst(), 0); 4026 A::Label all_true; 4027 a->cbnz(GP0, &all_true); 4028 a->brk(0); 4029 a->label(&all_true); 4030 } break; 4031 4032 case Op::trace_line: 4033 case Op::trace_var: 4034 case Op::trace_enter: 4035 case Op::trace_exit: 4036 case Op::trace_scope: 4037 /* Force this program to run in the interpreter. 
*/ 4038 return false; 4039 4040 case Op::index: { 4041 A::V tmp = alloc_tmp(); 4042 a->ldrq (tmp, &iota); 4043 a->dup4s(dst(), N); 4044 a->sub4s(dst(), dst(), tmp); 4045 free_tmp(tmp); 4046 } break; 4047 4048 case Op::store8: a->xtns2h(dst(x), r(x)); 4049 a->xtnh2b(dst(), dst()); 4050 if (scalar) { a->strb (dst(), arg[immA]); } 4051 else { a->strs (dst(), arg[immA]); } 4052 break; 4053 4054 case Op::store16: a->xtns2h(dst(x), r(x)); 4055 if (scalar) { a->strh (dst(), arg[immA]); } 4056 else { a->strd (dst(), arg[immA]); } 4057 break; 4058 4059 case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); } 4060 else { a->strq(r(x), arg[immA]); } 4061 break; 4062 4063 case Op::store64: if (scalar) { 4064 a->strs(r(x), arg[immA], 0); 4065 a->strs(r(y), arg[immA], 1); 4066 } else if (r(y) == r(x)+1) { 4067 a->st24s(r(x), arg[immA]); 4068 } else { 4069 Reg tmp0 = alloc_tmp(2), 4070 tmp1 = (Reg)(tmp0+1); 4071 a->orr16b(tmp0, r(x), r(x)); 4072 a->orr16b(tmp1, r(y), r(y)); 4073 a-> st24s(tmp0, arg[immA]); 4074 free_tmp(tmp0); 4075 free_tmp(tmp1); 4076 } break; 4077 4078 case Op::store128: 4079 if (scalar) { 4080 a->strs(r(x), arg[immA], 0); 4081 a->strs(r(y), arg[immA], 1); 4082 a->strs(r(z), arg[immA], 2); 4083 a->strs(r(w), arg[immA], 3); 4084 } else if (r(y) == r(x)+1 && 4085 r(z) == r(x)+2 && 4086 r(w) == r(x)+3) { 4087 a->st44s(r(x), arg[immA]); 4088 } else { 4089 Reg tmp0 = alloc_tmp(4), 4090 tmp1 = (Reg)(tmp0+1), 4091 tmp2 = (Reg)(tmp0+2), 4092 tmp3 = (Reg)(tmp0+3); 4093 a->orr16b(tmp0, r(x), r(x)); 4094 a->orr16b(tmp1, r(y), r(y)); 4095 a->orr16b(tmp2, r(z), r(z)); 4096 a->orr16b(tmp3, r(w), r(w)); 4097 a-> st44s(tmp0, arg[immA]); 4098 free_tmp(tmp0); 4099 free_tmp(tmp1); 4100 free_tmp(tmp2); 4101 free_tmp(tmp3); 4102 } break; 4103 4104 4105 case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); } 4106 else { a->ldrs(dst(), arg[immA]); } 4107 a->uxtlb2h(dst(), dst()); 4108 a->uxtlh2s(dst(), dst()); 4109 break; 4110 4111 case Op::load16: if (scalar) { a->ldrh(dst(), 
arg[immA]); } 4112 else { a->ldrd(dst(), arg[immA]); } 4113 a->uxtlh2s(dst(), dst()); 4114 break; 4115 4116 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); } 4117 else { a->ldrq(dst(), arg[immA]); } 4118 break; 4119 4120 case Op::load64: if (scalar) { 4121 a->ldrs(dst(), arg[immA], immB); 4122 } else { 4123 Reg tmp0 = alloc_tmp(2), 4124 tmp1 = (Reg)(tmp0+1); 4125 a->ld24s(tmp0, arg[immA]); 4126 // TODO: return both 4127 switch (immB) { 4128 case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break; 4129 case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break; 4130 } 4131 } break; 4132 4133 case Op::load128: if (scalar) { 4134 a->ldrs(dst(), arg[immA], immB); 4135 } else { 4136 Reg tmp0 = alloc_tmp(4), 4137 tmp1 = (Reg)(tmp0+1), 4138 tmp2 = (Reg)(tmp0+2), 4139 tmp3 = (Reg)(tmp0+3); 4140 a->ld44s(tmp0, arg[immA]); 4141 // TODO: return all four 4142 switch (immB) { 4143 case 0: mark_tmp_as_dst(tmp0); break; 4144 case 1: mark_tmp_as_dst(tmp1); break; 4145 case 2: mark_tmp_as_dst(tmp2); break; 4146 case 3: mark_tmp_as_dst(tmp3); break; 4147 } 4148 if (immB != 0) { free_tmp(tmp0); } 4149 if (immB != 1) { free_tmp(tmp1); } 4150 if (immB != 2) { free_tmp(tmp2); } 4151 if (immB != 3) { free_tmp(tmp3); } 4152 } break; 4153 4154 case Op::uniform32: a->add(GP0, arg[immA], immB); 4155 a->ld1r4s(dst(), GP0); 4156 break; 4157 4158 case Op::array32: a->add(GP0, arg[immA], immB); 4159 a->ldrd(GP0, GP0); 4160 a->add(GP0, GP0, immC); 4161 a->ld1r4s(dst(), GP0); 4162 break; 4163 4164 case Op::gather8: { 4165 // As usual, the gather base pointer is immB bytes off of uniform immA. 4166 a->add (GP0, arg[immA], immB); // GP0 = &(gather base pointer) 4167 a->ldrd(GP0, GP0); // GP0 = gather base pointer 4168 4169 for (int i = 0; i < active_lanes; i++) { 4170 a->movs(GP1, r(x), i); // Extract index lane i into GP1. 4171 a->add (GP1, GP0, GP1); // Add the gather base pointer. 4172 a->ldrb(GP1, GP1); // Load that byte. 4173 a->inss(dst(x), GP1, i); // Insert it into dst() lane i. 
4174 } 4175 } break; 4176 4177 // See gather8 for general idea; comments here only where gather16 differs. 4178 case Op::gather16: { 4179 a->add (GP0, arg[immA], immB); 4180 a->ldrd(GP0, GP0); 4181 for (int i = 0; i < active_lanes; i++) { 4182 a->movs(GP1, r(x), i); 4183 a->add (GP1, GP0, GP1, A::LSL, 1); // Scale index 2x into a byte offset. 4184 a->ldrh(GP1, GP1); // 2-byte load. 4185 a->inss(dst(x), GP1, i); 4186 } 4187 } break; 4188 4189 // See gather8 for general idea; comments here only where gather32 differs. 4190 case Op::gather32: { 4191 a->add (GP0, arg[immA], immB); 4192 a->ldrd(GP0, GP0); 4193 for (int i = 0; i < active_lanes; i++) { 4194 a->movs(GP1, r(x), i); 4195 a->add (GP1, GP0, GP1, A::LSL, 2); // Scale index 4x into a byte offset. 4196 a->ldrs(GP1, GP1); // 4-byte load. 4197 a->inss(dst(x), GP1, i); 4198 } 4199 } break; 4200 4201 case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break; 4202 case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break; 4203 case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break; 4204 case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break; 4205 4206 case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break; 4207 4208 case Op::fma_f32: // fmla.4s is z += x*y 4209 if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); } 4210 else { a->orr16b(dst(), r(z), r(z)); 4211 a->fmla4s(dst(), r(x), r(y)); } 4212 break; 4213 4214 case Op::fnma_f32: // fmls.4s is z -= x*y 4215 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 4216 else { a->orr16b(dst(), r(z), r(z)); 4217 a->fmls4s(dst(), r(x), r(y)); } 4218 break; 4219 4220 case Op::fms_f32: // calculate z - xy, then negate to xy - z 4221 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 4222 else { a->orr16b(dst(), r(z), r(z)); 4223 a->fmls4s(dst(), r(x), r(y)); } 4224 a->fneg4s(dst(), dst()); 4225 break; 4226 4227 case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break; 4228 case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break; 4229 case Op:: eq_f32: a->fcmeq4s (dst(x,y), 
r(x), r(y)); break; 4230 case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); 4231 a->not16b (dst(), dst()); break; 4232 4233 4234 case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break; 4235 case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break; 4236 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break; 4237 4238 case Op::bit_and : a->and16b(dst(x,y), r(x), r(y)); break; 4239 case Op::bit_or : a->orr16b(dst(x,y), r(x), r(y)); break; 4240 case Op::bit_xor : a->eor16b(dst(x,y), r(x), r(y)); break; 4241 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break; 4242 4243 case Op::select: // bsl16b is x = x ? y : z 4244 if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); } 4245 else { a->orr16b(dst(), r(x), r(x)); 4246 a->bsl16b(dst(), r(y), r(z)); } 4247 break; 4248 4249 // fmin4s and fmax4s don't work the way we want with NaN, 4250 // so we write them the long way: 4251 case Op::min_f32: // min(x,y) = y<x ? y : x 4252 a->fcmgt4s(dst(), r(x), r(y)); 4253 a->bsl16b (dst(), r(y), r(x)); 4254 break; 4255 4256 case Op::max_f32: // max(x,y) = x<y ? 
y : x 4257 a->fcmgt4s(dst(), r(y), r(x)); 4258 a->bsl16b (dst(), r(y), r(x)); 4259 break; 4260 4261 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break; 4262 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break; 4263 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break; 4264 4265 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break; 4266 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break; 4267 4268 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break; 4269 case Op::trunc: a->fcvtzs4s(dst(x), r(x)); break; 4270 case Op::round: a->fcvtns4s(dst(x), r(x)); break; 4271 case Op::ceil: a->frintp4s(dst(x), r(x)); break; 4272 case Op::floor: a->frintm4s(dst(x), r(x)); break; 4273 4274 case Op::to_fp16: 4275 a->fcvtn (dst(x), r(x)); // 4x f32 -> 4x f16 in bottom four lanes 4276 a->uxtlh2s(dst(), dst()); // expand to 4x f16 in even 16-bit lanes 4277 break; 4278 4279 case Op::from_fp16: 4280 a->xtns2h(dst(x), r(x)); // pack even 16-bit lanes into bottom four lanes 4281 a->fcvtl (dst(), dst()); // 4x f16 -> 4x f32 4282 break; 4283 4284 case Op::duplicate: break; 4285 #endif 4286 } 4287 4288 // Proactively free the registers holding any value that dies here. 
4289 if (rd != NA && dies_here(regs[rd])) { regs[rd] = NA; } 4290 if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; } 4291 if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; } 4292 if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; } 4293 if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; } 4294 return true; 4295 }; 4296 4297 #if defined(__x86_64__) || defined(_M_X64) 4298 auto jump_if_less = [&](A::Label* l) { a->jl (l); }; 4299 auto jump = [&](A::Label* l) { a->jmp(l); }; 4300 4301 auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; 4302 auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; 4303 #elif defined(__aarch64__) 4304 auto jump_if_less = [&](A::Label* l) { a->blt(l); }; 4305 auto jump = [&](A::Label* l) { a->b (l); }; 4306 4307 auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; 4308 auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; 4309 #endif 4310 4311 A::Label body, 4312 tail, 4313 done; 4314 4315 enter(); 4316 for (Val id = 0; id < (Val)instructions.size(); id++) { 4317 if (fImpl->visualizer && is_trace(instructions[id].op)) { 4318 // Make sure trace commands stay on JIT for visualizer 4319 continue; 4320 } 4321 auto start = a->size(); 4322 if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 4323 return false; 4324 } 4325 if (fImpl->visualizer && instructions[id].can_hoist) { 4326 fImpl->visualizer->addMachineCommands(id, start, a->size()); 4327 } 4328 } 4329 4330 // This point marks a kind of canonical fixed point for register contents: if loop 4331 // code is generated as if these registers are holding these values, the next time 4332 // the loop comes around we'd better find those same registers holding those same values. 
4333 auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot, 4334 saved_next_stack_slot=next_stack_slot]{ 4335 for (int r = 0; r < (int)regs.size(); r++) { 4336 if (regs[r] != incoming[r]) { 4337 regs[r] = incoming[r]; 4338 if (regs[r] >= 0) { 4339 load_from_memory((Reg)r, regs[r]); 4340 } 4341 } 4342 } 4343 *stack_hint = std::max(*stack_hint, next_stack_slot); 4344 stack_slot = saved_stack_slot; 4345 next_stack_slot = saved_next_stack_slot; 4346 }; 4347 4348 a->label(&body); 4349 { 4350 a->cmp(N, K); 4351 jump_if_less(&tail); 4352 for (Val id = 0; id < (Val)instructions.size(); id++) { 4353 if (fImpl->visualizer != nullptr && is_trace(instructions[id].op)) { 4354 // Make sure trace commands stay on JIT for visualizer 4355 continue; 4356 } 4357 auto start = a->size(); 4358 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 4359 return false; 4360 } 4361 if (fImpl->visualizer && !instructions[id].can_hoist) { 4362 fImpl->visualizer->addMachineCommands(id, start, a->size()); 4363 } 4364 } 4365 restore_incoming_regs(); 4366 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 4367 if (fImpl->strides[i]) { 4368 add(arg[i], K*fImpl->strides[i]); 4369 } 4370 } 4371 sub(N, K); 4372 jump(&body); 4373 } 4374 4375 a->label(&tail); 4376 { 4377 a->cmp(N, 1); 4378 jump_if_less(&done); 4379 for (Val id = 0; id < (Val)instructions.size(); id++) { 4380 if (fImpl->visualizer && is_trace(instructions[id].op)) { 4381 // Make sure trace commands stay on JIT for visualizer 4382 continue; 4383 } 4384 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) { 4385 return false; 4386 } 4387 } 4388 restore_incoming_regs(); 4389 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 4390 if (fImpl->strides[i]) { 4391 add(arg[i], 1*fImpl->strides[i]); 4392 } 4393 } 4394 sub(N, 1); 4395 jump(&tail); 4396 } 4397 4398 a->label(&done); 4399 { 4400 exit(); 4401 } 4402 4403 // Except for explicit aligned load and store instructions, AVX allows 4404 // 
// memory operands to be unaligned.  So even though we're creating 16
    // byte patterns on ARM or 32-byte patterns on x86, we only need to
    // align to 4 bytes, the element size and alignment requirement.

    // Emit the pool of splatted 32-bit constants referenced by the program,
    // one K-wide pattern per distinct immediate.
    constants.foreach([&](int imm, A::Label* label) {
        a->align(4);
        a->label(label);
        for (int i = 0; i < K; i++) {
            a->word(imm);
        }
    });

    // Lane-index pattern used by Op::index, emitted only if something used it.
    if (!iota.references.empty()) {
        a->align(4);
        a->label(&iota);        // 0,1,2,3,4,...
        for (int i = 0; i < K; i++) {
            a->word(i);
        }
    }

    // Shuffle-index pattern used by 64-bit load lowering, emitted only if used.
    if (!load64_index.references.empty()) {
        a->align(4);
        a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
        a->word(0); a->word(2); a->word(4); a->word(6);
        a->word(1); a->word(3); a->word(5); a->word(7);
    }

    return true;
}

// Drive jit() twice: a dry run with a null buffer to size the code and gather
// stack/register feedback, then a real run into a freshly allocated JIT buffer
// that is finally remapped executable.
void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
                       const char* debug_name) {
    // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
    // and stack_hint/registers_used to feed forward into the next jit() call.
    Assembler a{nullptr};
    int stack_hint = -1;
    uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
    if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
        // Program contains something we can't JIT; fall back to the interpreter.
        return;
    }

    fImpl->jit_size = a.size();
    void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
    fImpl->jit_entry.store(jit_entry);

    // Assemble the program for real with stack_hint/registers_used as feedback from first call.
    a = Assembler{jit_entry};
    SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
    // The second pass may shrink the code (better register use), never grow it.
    SkASSERT(a.size() <= fImpl->jit_size);

    // Remap as executable, and flush caches on platforms that need that.
    remap_as_executable(jit_entry, fImpl->jit_size);

    notify_vtune(debug_name, jit_entry, fImpl->jit_size);

#if !defined(SK_BUILD_FOR_WIN)
    // For profiling and debugging, it's helpful to have this code loaded
    // dynamically rather than just jumping into fImpl->jit_entry.
    if (gSkVMJITViaDylib) {
        // Dump the raw program binary.
        // NOTE(review): mkstemp()/write()/close() results are unchecked here —
        // if /tmp isn't writable fd is -1 and the dump silently fails; verify.
        SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
        int fd = mkstemp(path.writable_str());
        ::write(fd, jit_entry, a.size());
        close(fd);

        this->dropJIT();  // (unmap and null out fImpl->jit_entry.)

        // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
        // NOTE(review): relies on clang being on PATH; system() result unchecked.
        SkString cmd = SkStringPrintf(
                "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
                " | clang -x assembler -shared - -o %s",
                path.c_str(), path.c_str());
        system(cmd.c_str());

        // Load that dynamic library and look up skvm_jit().
        // The symbol may or may not carry a leading underscore depending on the
        // platform's mangling, so try both spellings.
        fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
        void* sym = nullptr;
        for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
            if (!sym) { sym = dlsym(fImpl->dylib, name); }
        }
        fImpl->jit_entry.store(sym);
    }
#endif
}

// Debugging aid: dump the JIT'd code to a temp file, wrap it in a dylib, and
// pipe it through objdump, writing the disassembly to `o` (or SkDebugf if null).
void Program::disassemble(SkWStream* o) const {
#if !defined(SK_BUILD_FOR_WIN)
    SkDebugfStream debug;
    if (!o) { o = &debug; }

    const void* jit_entry = fImpl->jit_entry.load();
    size_t jit_size = fImpl->jit_size;

    if (!jit_entry) {
        o->writeText("Program not JIT'd. Did you pass --jit?\n");
        return;
    }

    // NOTE(review): as in setupJIT, mkstemp()/write() results are unchecked.
    char path[] = "/tmp/skvm-jit.XXXXXX";
    int fd = mkstemp(path);
    ::write(fd, jit_entry, jit_size);
    close(fd);

    // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
    SkString cmd = SkStringPrintf(
            "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
            " | clang -x assembler -shared - -o %s",
            path, path);
    system(cmd.c_str());

    // Now objdump to disassemble our function:
    // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
    // but the symbol name varies with OS, and that option may be missing from objdump on some
    // machines?  There also appears to be quite a bit of junk after the end of the JIT'd code.
    // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
    // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
    cmd = SkStringPrintf("objdump -D %s", path);
#if defined(SK_BUILD_FOR_UNIX)
    cmd.append(" --section=.text");
#endif
    FILE* fp = popen(cmd.c_str(), "r");
    if (!fp) {
        o->writeText("objdump failed\n");
        return;
    }

    // Stream objdump's output line by line into the destination stream.
    char line[1024];
    while (fgets(line, sizeof(line), fp)) {
        o->writeText(line);
    }

    pclose(fp);
#endif
}

#endif

}  // namespace skvm