1 /* 2 * Copyright 2019 Google LLC 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "include/core/SkStream.h" 9 #include "include/core/SkString.h" 10 #include "include/private/base/SkTFitsIn.h" 11 #include "include/private/base/SkThreadID.h" 12 #include "src/base/SkHalf.h" 13 #include "src/core/SkColorSpacePriv.h" 14 #include "src/core/SkColorSpaceXformSteps.h" 15 #include "src/core/SkCpu.h" 16 #include "src/core/SkEnumerate.h" 17 #include "src/core/SkOpts.h" 18 #include "src/core/SkStreamPriv.h" 19 #include "src/core/SkVM.h" 20 #include "src/utils/SkVMVisualizer.h" 21 #include <algorithm> 22 #include <atomic> 23 #include <queue> 24 25 #if !defined(SK_BUILD_FOR_WIN) 26 #include <unistd.h> 27 #endif 28 29 bool gSkVMAllowJIT{false}; 30 bool gSkVMJITViaDylib{false}; 31 32 #if defined(SKVM_JIT) 33 #if defined(SK_BUILD_FOR_WIN) 34 #include "src/base/SkLeanWindows.h" 35 #include <memoryapi.h> 36 alloc_jit_buffer(size_t * len)37 static void* alloc_jit_buffer(size_t* len) { 38 return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); 39 } remap_as_executable(void * ptr,size_t len)40 static void remap_as_executable(void* ptr, size_t len) { 41 DWORD old; 42 VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old); 43 SkASSERT(old == PAGE_READWRITE); 44 } unmap_jit_buffer(void * ptr,size_t len)45 static void unmap_jit_buffer(void* ptr, size_t len) { 46 VirtualFree(ptr, 0, MEM_RELEASE); 47 } close_dylib(void * dylib)48 static void close_dylib(void* dylib) { 49 SkASSERT(false); // TODO? For now just assert we never make one. 50 } 51 #else 52 #include <dlfcn.h> 53 #include <sys/mman.h> 54 alloc_jit_buffer(size_t * len)55 static void* alloc_jit_buffer(size_t* len) { 56 // While mprotect and VirtualAlloc both work at page granularity, 57 // mprotect doesn't round up for you, and instead requires *len is at page granularity. 58 const size_t page = sysconf(_SC_PAGESIZE); 59 *len = ((*len + page - 1) / page) * page; 60 return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0); 61 } remap_as_executable(void * ptr,size_t len)62 static void remap_as_executable(void* ptr, size_t len) { 63 mprotect(ptr, len, PROT_READ|PROT_EXEC); 64 __builtin___clear_cache((char*)ptr, 65 (char*)ptr + len); 66 } unmap_jit_buffer(void * ptr,size_t len)67 static void unmap_jit_buffer(void* ptr, size_t len) { 68 munmap(ptr, len); 69 } close_dylib(void * dylib)70 static void close_dylib(void* dylib) { 71 dlclose(dylib); 72 } 73 #endif 74 #endif 75 76 // JIT code isn't MSAN-instrumented, so we won't see when it uses 77 // uninitialized memory, and we'll not see the writes it makes as properly 78 // initializing memory. Instead force the interpreter, which should let 79 // MSAN see everything our programs do properly. 80 // 81 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter. 82 #if defined(__has_feature) 83 #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer) 84 #define SKVM_JIT_BUT_IGNORE_IT 85 #endif 86 #endif 87 88 #if defined(SKSL_STANDALONE) 89 // skslc needs to link against this module (for the VM code generator). This module pulls in 90 // color-space code, but attempting to add those transitive dependencies to skslc gets out of 91 // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM 92 // never cares about color management. sk_program_transfer_fn(skvm::F32 v,skcms_TFType tf_type,skvm::F32 G,skvm::F32 A,skvm::F32 B,skvm::F32 C,skvm::F32 D,skvm::F32 E,skvm::F32 F)93 skvm::F32 sk_program_transfer_fn( 94 skvm::F32 v, skcms_TFType tf_type, 95 skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) { 96 return v; 97 } 98 skcms_sRGB_TransferFunction()99 const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; } skcms_sRGB_Inverse_TransferFunction()100 const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; } 101 #endif 102 103 namespace skvm { 104 detect_features()105 static Features detect_features() { 106 static const bool fma = 107 #if defined(SK_CPU_X86) 108 SkCpu::Supports(SkCpu::HSW); 109 #elif defined(SK_CPU_ARM64) 110 true; 111 #else 112 false; 113 #endif 114 115 static const bool fp16 = false; // TODO 116 117 return { fma, fp16 }; 118 } 119 Builder(bool createDuplicates)120 Builder::Builder(bool createDuplicates) 121 : fFeatures(detect_features()), fCreateDuplicates(createDuplicates) {} Builder(Features features,bool createDuplicates)122 Builder::Builder(Features features, bool createDuplicates) 123 : fFeatures(features ), fCreateDuplicates(createDuplicates) {} 124 125 struct Program::Impl { 126 std::vector<InterpreterInstruction> instructions; 127 int regs = 0; 128 int loop = 0; 129 std::vector<int> strides; 130 std::vector<TraceHook*> traceHooks; 131 std::unique_ptr<viz::Visualizer> visualizer; 132 133 std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders 134 size_t jit_size = 0; 135 void* dylib = nullptr; 136 }; 137 138 // Debugging tools, mostly for printing various data structures out to a stream. 139 140 namespace { 141 struct V { Val id; }; 142 struct R { Reg id; }; 143 struct Shift { int bits; }; 144 struct Splat { int bits; }; 145 struct Hex { int bits; }; 146 struct TraceHookID { int bits; }; 147 // For op `trace_line` 148 struct Line { int bits; }; 149 // For op `trace_var` 150 struct VarSlot { int bits; }; 151 // For op `trace_enter`/`trace_exit` 152 struct FnIdx { int bits; }; 153 write(SkWStream * o,const char * s)154 static void write(SkWStream* o, const char* s) { 155 o->writeText(s); 156 } 157 name(Op op)158 static const char* name(Op op) { 159 switch (op) { 160 #define M(x) case Op::x: return #x; 161 SKVM_OPS(M) 162 #undef M 163 } 164 return "unknown op"; 165 } 166 write(SkWStream * o,Op op)167 static void write(SkWStream* o, Op op) { 168 o->writeText(name(op)); 169 } write(SkWStream * o,Ptr p)170 static void write(SkWStream* o, Ptr p) { 171 write(o, "ptr"); 172 o->writeDecAsText(p.ix); 173 } write(SkWStream * o,V v)174 static void write(SkWStream* o, V v) { 175 write(o, "v"); 176 o->writeDecAsText(v.id); 177 } write(SkWStream * o,R r)178 static void write(SkWStream* o, R r) { 179 write(o, "r"); 180 o->writeDecAsText(r.id); 181 } write(SkWStream * o,Shift s)182 static void write(SkWStream* o, Shift s) { 183 o->writeDecAsText(s.bits); 184 } write(SkWStream * o,Splat s)185 static void write(SkWStream* o, Splat s) { 186 float f; 187 memcpy(&f, &s.bits, 4); 188 o->writeHexAsText(s.bits); 189 write(o, " ("); 190 o->writeScalarAsText(f); 191 write(o, ")"); 192 } write(SkWStream * o,Hex h)193 static void write(SkWStream* o, Hex h) { 194 o->writeHexAsText(h.bits); 195 } write(SkWStream * o,TraceHookID h)196 static void write(SkWStream* o, TraceHookID h) { 197 o->writeDecAsText(h.bits); 198 } write(SkWStream * o,Line d)199 static void write(SkWStream* o, Line d) { 200 write(o, "L"); 201 o->writeDecAsText(d.bits); 202 } write(SkWStream * o,VarSlot s)203 static void write(SkWStream* o, VarSlot s) { 204 write(o, "$"); 205 o->writeDecAsText(s.bits); 206 } write(SkWStream * o,FnIdx s)207 static void write(SkWStream* o, FnIdx s) { 208 write(o, "F"); 209 o->writeDecAsText(s.bits); 210 } 211 template <typename T, typename... Ts> write(SkWStream * o,T first,Ts...rest)212 static void write(SkWStream* o, T first, Ts... rest) { 213 write(o, first); 214 write(o, " "); 215 write(o, rest...); 216 } 217 } // namespace 218 write_one_instruction(Val id,const OptimizedInstruction & inst,SkWStream * o)219 static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) { 220 Op op = inst.op; 221 Val x = inst.x, 222 y = inst.y, 223 z = inst.z, 224 w = inst.w; 225 int immA = inst.immA, 226 immB = inst.immB, 227 immC = inst.immC; 228 switch (op) { 229 case Op::assert_true: write(o, op, V{x}, V{y}); break; 230 231 case Op::trace_line: write(o, op, TraceHookID{immA}, V{x}, V{y}, Line{immB}); break; 232 case Op::trace_var: write(o, op, TraceHookID{immA}, V{x}, V{y}, 233 VarSlot{immB}, "=", V{z}); break; 234 case Op::trace_enter: write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break; 235 case Op::trace_exit: write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break; 236 case Op::trace_scope: write(o, op, TraceHookID{immA}, V{x}, V{y}, Shift{immB}); break; 237 238 case Op::store8: write(o, op, Ptr{immA}, V{x} ); break; 239 case Op::store16: write(o, op, Ptr{immA}, V{x} ); break; 240 case Op::store32: write(o, op, Ptr{immA}, V{x} ); break; 241 case Op::store64: write(o, op, Ptr{immA}, V{x},V{y} ); break; 242 case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break; 243 244 case Op::index: write(o, V{id}, "=", op); break; 245 246 case Op::load8: write(o, V{id}, "=", op, Ptr{immA}); break; 247 case Op::load16: write(o, V{id}, "=", op, Ptr{immA}); break; 248 case Op::load32: write(o, V{id}, "=", op, Ptr{immA}); break; 249 case Op::load64: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 250 case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 251 252 case Op::gather8: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 253 case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 254 case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break; 255 256 case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break; 257 case Op::array32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break; 258 259 case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break; 260 261 case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 262 case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 263 case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 264 case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 265 case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 266 case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 267 case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 268 case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 269 case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 270 271 272 case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break; 273 274 case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 275 case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 276 case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 277 case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 278 279 280 case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 281 case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 282 case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 283 284 case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 285 case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 286 case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break; 287 288 case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 289 case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 290 291 292 case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y}); break; 293 case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y}); break; 294 case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y}); break; 295 case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break; 296 297 case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 298 299 case Op::ceil: write(o, V{id}, "=", op, V{x}); break; 300 case Op::floor: write(o, V{id}, "=", op, V{x}); break; 301 case Op::to_f32: write(o, V{id}, "=", op, V{x}); break; 302 case Op::to_fp16: write(o, V{id}, "=", op, V{x}); break; 303 case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break; 304 case Op::trunc: write(o, V{id}, "=", op, V{x}); break; 305 case Op::round: write(o, V{id}, "=", op, V{x}); break; 306 307 case Op::duplicate: write(o, V{id}, "=", op, Hex{immA}); break; 308 } 309 310 write(o, "\n"); 311 } 312 dump(SkWStream * o) const313 void Builder::dump(SkWStream* o) const { 314 SkDebugfStream debug; 315 if (!o) { o = &debug; } 316 317 std::vector<OptimizedInstruction> optimized = this->optimize(); 318 o->writeDecAsText(optimized.size()); 319 o->writeText(" values (originally "); 320 o->writeDecAsText(fProgram.size()); 321 o->writeText("):\n"); 322 for (Val id = 0; id < (Val)optimized.size(); id++) { 323 const OptimizedInstruction& inst = optimized[id]; 324 write(o, inst.can_hoist ? "↑ " : " "); 325 write_one_instruction(id, inst, o); 326 } 327 } 328 visualize(SkWStream * output) const329 void Program::visualize(SkWStream* output) const { 330 if (fImpl->visualizer) { 331 fImpl->visualizer->dump(output); 332 } 333 } 334 visualizer()335 viz::Visualizer* Program::visualizer() { return fImpl->visualizer.get(); } dump(SkWStream * o) const336 void Program::dump(SkWStream* o) const { 337 SkDebugfStream debug; 338 if (!o) { o = &debug; } 339 340 o->writeDecAsText(fImpl->regs); 341 o->writeText(" registers, "); 342 o->writeDecAsText(fImpl->instructions.size()); 343 o->writeText(" instructions:\n"); 344 for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) { 345 if (i == fImpl->loop) { write(o, "loop:\n"); } 346 o->writeDecAsText(i); 347 o->writeText("\t"); 348 if (i >= fImpl->loop) { write(o, " "); } 349 const InterpreterInstruction& inst = fImpl->instructions[i]; 350 Op op = inst.op; 351 Reg d = inst.d, 352 x = inst.x, 353 y = inst.y, 354 z = inst.z, 355 w = inst.w; 356 int immA = inst.immA, 357 immB = inst.immB, 358 immC = inst.immC; 359 switch (op) { 360 case Op::assert_true: write(o, op, R{x}, R{y}); break; 361 362 case Op::trace_line: write(o, op, TraceHookID{immA}, 363 R{x}, R{y}, Line{immB}); break; 364 case Op::trace_var: write(o, op, TraceHookID{immA}, R{x}, R{y}, 365 VarSlot{immB}, "=", R{z}); break; 366 case Op::trace_enter: write(o, op, TraceHookID{immA}, 367 R{x}, R{y}, FnIdx{immB}); break; 368 case Op::trace_exit: write(o, op, TraceHookID{immA}, 369 R{x}, R{y}, FnIdx{immB}); break; 370 case Op::trace_scope: write(o, op, TraceHookID{immA}, 371 R{x}, R{y}, Shift{immB}); break; 372 373 case Op::store8: write(o, op, Ptr{immA}, R{x} ); break; 374 case Op::store16: write(o, op, Ptr{immA}, R{x} ); break; 375 case Op::store32: write(o, op, Ptr{immA}, R{x} ); break; 376 case Op::store64: write(o, op, Ptr{immA}, R{x}, R{y} ); break; 377 case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break; 378 379 case Op::index: write(o, R{d}, "=", op); break; 380 381 case Op::load8: write(o, R{d}, "=", op, Ptr{immA}); break; 382 case Op::load16: write(o, R{d}, "=", op, Ptr{immA}); break; 383 case Op::load32: write(o, R{d}, "=", op, Ptr{immA}); break; 384 case Op::load64: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 385 case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 386 387 case Op::gather8: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 388 case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 389 case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break; 390 391 case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break; 392 case Op::array32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break; 393 394 case Op::splat: write(o, R{d}, "=", op, Splat{immA}); break; 395 396 case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 397 case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 398 case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 399 case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 400 case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 401 case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 402 case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 403 case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 404 case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 405 406 case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break; 407 408 case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 409 case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 410 case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 411 case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 412 413 414 case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 415 case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 416 case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 417 418 case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 419 case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 420 case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break; 421 422 case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 423 case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 424 425 case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y}); break; 426 case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y}); break; 427 case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y}); break; 428 case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break; 429 430 case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 431 432 case Op::ceil: write(o, R{d}, "=", op, R{x}); break; 433 case Op::floor: write(o, R{d}, "=", op, R{x}); break; 434 case Op::to_f32: write(o, R{d}, "=", op, R{x}); break; 435 case Op::to_fp16: write(o, R{d}, "=", op, R{x}); break; 436 case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break; 437 case Op::trunc: write(o, R{d}, "=", op, R{x}); break; 438 case Op::round: write(o, R{d}, "=", op, R{x}); break; 439 440 case Op::duplicate: write(o, R{d}, "=", op, Hex{immA}); break; 441 } 442 write(o, "\n"); 443 } 444 } eliminate_dead_code(std::vector<Instruction> program,viz::Visualizer * visualizer)445 std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program, 446 viz::Visualizer* visualizer) { 447 // Determine which Instructions are live by working back from side effects. 448 std::vector<bool> live(program.size(), false); 449 for (Val id = program.size(); id--;) { 450 if (live[id] || has_side_effect(program[id].op)) { 451 live[id] = true; 452 const Instruction& inst = program[id]; 453 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 454 if (arg != NA) { live[arg] = true; } 455 } 456 } 457 } 458 459 // Rewrite the program with only live Instructions: 460 // - remap IDs in live Instructions to what they'll be once dead Instructions are removed; 461 // - then actually remove the dead Instructions. 462 std::vector<Val> new_id(program.size(), NA); 463 for (Val id = 0, next = 0; id < (Val)program.size(); id++) { 464 if (live[id]) { 465 Instruction& inst = program[id]; 466 for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) { 467 if (*arg != NA) { 468 *arg = new_id[*arg]; 469 SkASSERT(*arg != NA); 470 } 471 } 472 new_id[id] = next++; 473 } 474 } 475 476 if (visualizer) { 477 visualizer->addInstructions(program); 478 visualizer->markAsDeadCode(live, new_id); 479 } 480 481 // Eliminate any non-live ops. 482 auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) { 483 Val id = (Val)(&inst - program.data()); 484 return !live[id]; 485 }); 486 program.erase(it, program.end()); 487 488 return program; 489 } 490 finalize(const std::vector<Instruction> program,viz::Visualizer * visualizer)491 std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program, 492 viz::Visualizer* visualizer) { 493 std::vector<OptimizedInstruction> optimized(program.size()); 494 for (Val id = 0; id < (Val)program.size(); id++) { 495 Instruction inst = program[id]; 496 optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w, 497 inst.immA,inst.immB,inst.immC, 498 /*death=*/id, /*can_hoist=*/true}; 499 } 500 501 // Each Instruction's inputs need to live at least until that Instruction issues. 502 for (Val id = 0; id < (Val)optimized.size(); id++) { 503 OptimizedInstruction& inst = optimized[id]; 504 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 505 // (We're walking in order, so this is the same as max()ing with the existing Val.) 506 if (arg != NA) { optimized[arg].death = id; } 507 } 508 } 509 510 // Mark which values don't depend on the loop and can be hoisted. 511 for (OptimizedInstruction& inst : optimized) { 512 // Varying loads (and gathers) and stores cannot be hoisted out of the loop. 513 if (is_always_varying(inst.op) || is_trace(inst.op)) { 514 inst.can_hoist = false; 515 } 516 517 // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself. 518 if (inst.can_hoist) { 519 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 520 if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; } 521 } 522 } 523 } 524 525 // Extend the lifetime of any hoisted value that's used in the loop to infinity. 526 for (OptimizedInstruction& inst : optimized) { 527 if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) { 528 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) { 529 if (arg != NA && optimized[arg].can_hoist) { 530 optimized[arg].death = (Val)program.size(); 531 } 532 } 533 } 534 } 535 536 if (visualizer) { 537 visualizer->finalize(program, optimized); 538 } 539 540 return optimized; 541 } 542 optimize(viz::Visualizer * visualizer) const543 std::vector<OptimizedInstruction> Builder::optimize(viz::Visualizer* visualizer) const { 544 std::vector<Instruction> program = this->program(); 545 program = eliminate_dead_code(std::move(program), visualizer); 546 return finalize (std::move(program), visualizer); 547 } 548 done(const char * debug_name,bool allow_jit) const549 Program Builder::done(const char* debug_name, 550 bool allow_jit) const { 551 return this->done(debug_name, allow_jit, /*visualizer=*/nullptr); 552 } 553 done(const char * debug_name,bool allow_jit,std::unique_ptr<viz::Visualizer> visualizer) const554 Program Builder::done(const char* debug_name, 555 bool allow_jit, 556 std::unique_ptr<viz::Visualizer> visualizer) const { 557 char buf[64] = "skvm-jit-"; 558 if (!debug_name) { 559 *SkStrAppendU32(buf+9, this->hash()) = '\0'; 560 debug_name = buf; 561 } 562 563 auto optimized = this->optimize(visualizer ? visualizer.get() : nullptr); 564 return {optimized, 565 std::move(visualizer), 566 fStrides, 567 fTraceHooks, debug_name, allow_jit}; 568 } 569 hash() const570 uint64_t Builder::hash() const { 571 uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0), 572 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1); 573 return (uint64_t)lo | (uint64_t)hi << 32; 574 } 575 operator !=(Ptr a,Ptr b)576 bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; } 577 operator ==(const Instruction & a,const Instruction & b)578 bool operator==(const Instruction& a, const Instruction& b) { 579 return a.op == b.op 580 && a.x == b.x 581 && a.y == b.y 582 && a.z == b.z 583 && a.w == b.w 584 && a.immA == b.immA 585 && a.immB == b.immB 586 && a.immC == b.immC; 587 } 588 operator ()(const Instruction & inst,uint32_t seed) const589 uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const { 590 return SkOpts::hash(&inst, sizeof(inst), seed); 591 } 592 593 594 // Most instructions produce a value and return it by ID, 595 // the value-producing instruction's own index in the program vector. push(Instruction inst)596 Val Builder::push(Instruction inst) { 597 // Basic common subexpression elimination: 598 // if we've already seen this exact Instruction, use it instead of creating a new one. 599 // 600 // But we never dedup loads or stores: an intervening store could change that memory. 601 // Uniforms and gathers touch only uniform memory, so they're fine to dedup, 602 // and index is varying but doesn't touch memory, so it's fine to dedup too. 603 if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) { 604 if (Val* id = fIndex.find(inst)) { 605 if (fCreateDuplicates) { 606 inst.op = Op::duplicate; 607 inst.immA = *id; 608 fProgram.push_back(inst); 609 } 610 return *id; 611 } 612 } 613 614 Val id = static_cast<Val>(fProgram.size()); 615 fProgram.push_back(inst); 616 fIndex.set(inst, id); 617 return id; 618 } 619 arg(int stride)620 Ptr Builder::arg(int stride) { 621 int ix = (int)fStrides.size(); 622 fStrides.push_back(stride); 623 return {ix}; 624 } 625 assert_true(I32 cond,I32 debug)626 void Builder::assert_true(I32 cond, I32 debug) { 627 #ifdef SK_DEBUG 628 int imm; 629 if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; } 630 (void)push(Op::assert_true, cond.id, debug.id); 631 #endif 632 } 633 attachTraceHook(TraceHook * hook)634 int Builder::attachTraceHook(TraceHook* hook) { 635 int traceHookID = (int)fTraceHooks.size(); 636 fTraceHooks.push_back(hook); 637 return traceHookID; 638 } 639 mergeMasks(I32 & mask,I32 & traceMask)640 bool Builder::mergeMasks(I32& mask, I32& traceMask) { 641 if (this->isImm(mask.id, 0)) { return false; } 642 if (this->isImm(traceMask.id, 0)) { return false; } 643 if (this->isImm(mask.id, ~0)) { mask = traceMask; } 644 if (this->isImm(traceMask.id,~0)) { traceMask = mask; } 645 return true; 646 } 647 trace_line(int traceHookID,I32 mask,I32 traceMask,int line)648 void Builder::trace_line(int traceHookID, I32 mask, I32 traceMask, int line) { 649 SkASSERT(traceHookID >= 0); 650 SkASSERT(traceHookID < (int)fTraceHooks.size()); 651 if (!this->mergeMasks(mask, traceMask)) { return; } 652 (void)push(Op::trace_line, mask.id,traceMask.id,NA,NA, traceHookID, line); 653 } trace_var(int traceHookID,I32 mask,I32 traceMask,int slot,I32 val)654 void Builder::trace_var(int traceHookID, I32 mask, I32 traceMask, int slot, I32 val) { 655 SkASSERT(traceHookID >= 0); 656 SkASSERT(traceHookID < (int)fTraceHooks.size()); 657 if (!this->mergeMasks(mask, traceMask)) { return; } 658 (void)push(Op::trace_var, mask.id,traceMask.id,val.id,NA, traceHookID, slot); 659 } trace_enter(int traceHookID,I32 mask,I32 traceMask,int fnIdx)660 void Builder::trace_enter(int traceHookID, I32 mask, I32 traceMask, int fnIdx) { 661 SkASSERT(traceHookID >= 0); 662 SkASSERT(traceHookID < (int)fTraceHooks.size()); 663 if (!this->mergeMasks(mask, traceMask)) { return; } 664 (void)push(Op::trace_enter, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx); 665 } trace_exit(int traceHookID,I32 mask,I32 traceMask,int fnIdx)666 void Builder::trace_exit(int traceHookID, I32 mask, I32 traceMask, int fnIdx) { 667 SkASSERT(traceHookID >= 0); 668 SkASSERT(traceHookID < (int)fTraceHooks.size()); 669 if (!this->mergeMasks(mask, traceMask)) { return; } 670 (void)push(Op::trace_exit, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx); 671 } trace_scope(int traceHookID,I32 mask,I32 traceMask,int delta)672 void Builder::trace_scope(int traceHookID, I32 mask, I32 traceMask, int delta) { 673 SkASSERT(traceHookID >= 0); 674 SkASSERT(traceHookID < (int)fTraceHooks.size()); 675 if (!this->mergeMasks(mask, traceMask)) { return; } 676 (void)push(Op::trace_scope, mask.id,traceMask.id,NA,NA, traceHookID, delta); 677 } 678 store8(Ptr ptr,I32 val)679 void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); } store16(Ptr ptr,I32 val)680 void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); } store32(Ptr ptr,I32 val)681 void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); } store64(Ptr ptr,I32 lo,I32 hi)682 void Builder::store64(Ptr ptr, I32 lo, I32 hi) { 683 (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix); 684 } store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)685 void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) { 686 (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix); 687 } 688 index()689 I32 Builder::index() { return {this, push(Op::index)}; } 690 load8(Ptr ptr)691 I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; } load16(Ptr ptr)692 I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; } load32(Ptr ptr)693 I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; } load64(Ptr ptr,int lane)694 I32 Builder::load64(Ptr ptr, int lane) { 695 return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) }; 696 } load128(Ptr ptr,int lane)697 I32 Builder::load128(Ptr ptr, int lane) { 698 return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) }; 699 } 700 gather8(UPtr ptr,int offset,I32 index)701 I32 Builder::gather8 (UPtr ptr, int offset, I32 index) { 702 return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)}; 703 } gather16(UPtr ptr,int offset,I32 index)704 I32 Builder::gather16(UPtr ptr, int offset, I32 index) { 705 return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)}; 706 } gather32(UPtr ptr,int offset,I32 index)707 I32 Builder::gather32(UPtr ptr, int offset, I32 index) { 708 return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)}; 709 } 710 uniform32(UPtr ptr,int offset)711 I32 Builder::uniform32(UPtr ptr, int offset) { 712 return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)}; 713 } 714 715 // Note: this converts the array index into a byte offset for the op. array32(UPtr ptr,int offset,int index)716 I32 Builder::array32 (UPtr ptr, int offset, int index) { 717 return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))}; 718 } 719 splat(int n)720 I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; } 721 722 template <typename F32_or_I32> canonicalizeIdOrder(F32_or_I32 & x,F32_or_I32 & y)723 void Builder::canonicalizeIdOrder(F32_or_I32& x, F32_or_I32& y) { 724 bool immX = fProgram[x.id].op == Op::splat; 725 bool immY = fProgram[y.id].op == Op::splat; 726 if (immX != immY) { 727 if (immX) { 728 // Prefer (val, imm) over (imm, val). 729 std::swap(x, y); 730 } 731 return; 732 } 733 if (x.id > y.id) { 734 // Prefer (lower-ID, higher-ID) over (higher-ID, lower-ID). 735 std::swap(x, y); 736 } 737 } 738 739 // Be careful peepholing float math! Transformations you might expect to 740 // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0. 741 // Float peepholes must pass this equivalence test for all ~4B floats: 742 // 743 // bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); } 744 // 745 // unsigned bits = 0; 746 // do { 747 // float f; 748 // memcpy(&f, &bits, 4); 749 // if (!equiv(f, ...)) { 750 // abort(); 751 // } 752 // } while (++bits != 0); 753 add(F32 x,F32 y)754 F32 Builder::add(F32 x, F32 y) { 755 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } 756 this->canonicalizeIdOrder(x, y); 757 if (this->isImm(y.id, 0.0f)) { return x; } // x+0 == x 758 759 if (fFeatures.fma) { 760 if (fProgram[x.id].op == Op::mul_f32) { 761 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 762 } 763 if (fProgram[y.id].op == Op::mul_f32) { 764 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 765 } 766 } 767 return {this, this->push(Op::add_f32, x.id, y.id)}; 768 } 769 sub(F32 x,F32 y)770 F32 Builder::sub(F32 x, F32 y) { 771 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } 772 if (this->isImm(y.id, 0.0f)) { return x; } // x-0 == x 773 if (fFeatures.fma) { 774 if (fProgram[x.id].op == Op::mul_f32) { 775 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 776 } 777 if (fProgram[y.id].op == Op::mul_f32) { 778 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 779 } 780 } 781 return {this, this->push(Op::sub_f32, x.id, y.id)}; 782 } 783 mul(F32 x,F32 y)784 F32 Builder::mul(F32 x, F32 y) { 785 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } 786 this->canonicalizeIdOrder(x, y); 787 if (this->isImm(y.id, 1.0f)) { return x; } // x*1 == x 788 return {this, this->push(Op::mul_f32, x.id, y.id)}; 789 } 790 fast_mul(F32 x,F32 y)791 F32 Builder::fast_mul(F32 x, F32 y) { 792 if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); } 793 return mul(x,y); 794 } 795 div(F32 x,F32 y)796 F32 Builder::div(F32 x, F32 y) { 797 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); } 798 if (this->isImm(y.id, 1.0f)) { return x; } // x/1 == x 799 return {this, this->push(Op::div_f32, x.id, y.id)}; 800 } 801 sqrt(F32 x)802 F32 Builder::sqrt(F32 x) { 803 if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); } 804 return {this, this->push(Op::sqrt_f32, x.id)}; 805 } 806 807 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. approx_log2(F32 x)808 F32 Builder::approx_log2(F32 x) { 809 // e - 127 is a fair approximation of log2(x) in its own right... 810 F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23))); 811 812 // ... but using the mantissa to refine its error is _much_ better. 813 F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff), 814 0x3f000000)); 815 F32 approx = sub(e, 124.225514990f); 816 approx = sub(approx, mul(1.498030302f, m)); 817 approx = sub(approx, div(1.725879990f, add(0.3520887068f, m))); 818 819 return approx; 820 } 821 approx_pow2(F32 x)822 F32 Builder::approx_pow2(F32 x) { 823 constexpr float kInfinityBits = 0x7f800000; 824 825 F32 f = fract(x); 826 F32 approx = add(x, 121.274057500f); 827 approx = sub(approx, mul( 1.490129070f, f)); 828 approx = add(approx, div(27.728023300f, sub(4.84252568f, f))); 829 approx = mul(1.0f * (1<<23), approx); 830 approx = clamp(approx, 0, kInfinityBits); // guard against underflow/overflow 831 832 return pun_to_F32(round(approx)); 833 } 834 approx_powf(F32 x,F32 y)835 F32 Builder::approx_powf(F32 x, F32 y) { 836 // TODO: assert this instead? Sometimes x is very slightly negative. See skia:10210. 837 x = max(0.0f, x); 838 839 if (this->isImm(x.id, 1.0f)) { return x; } // 1^y is one 840 if (this->isImm(x.id, 2.0f)) { return this->approx_pow2(y); } // 2^y is pow2(y) 841 if (this->isImm(y.id, 0.5f)) { return this->sqrt(x); } // x^0.5 is sqrt(x) 842 if (this->isImm(y.id, 1.0f)) { return x; } // x^1 is x 843 if (this->isImm(y.id, 2.0f)) { return x * x; } // x^2 is x*x 844 845 auto is_x = bit_or(eq(x, 0.0f), 846 eq(x, 1.0f)); 847 return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); 848 } 849 850 // Bhaskara I's sine approximation 851 // 16x(pi - x) / (5*pi^2 - 4x(pi - x) 852 // ... divide by 4 853 // 4x(pi - x) / 5*pi^2/4 - x(pi - x) 854 // 855 // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get 856 // radians into that range first. 857 // approx_sin(F32 radians)858 F32 Builder::approx_sin(F32 radians) { 859 constexpr float Pi = SK_ScalarPI; 860 // x = radians mod 2pi 861 F32 x = fract(radians * (0.5f/Pi)) * (2*Pi); 862 I32 neg = x > Pi; // are we pi < x < 2pi --> need to negate result 863 x = select(neg, x - Pi, x); 864 865 F32 pair = x * (Pi - x); 866 x = 4.0f * pair / ((5*Pi*Pi/4) - pair); 867 x = select(neg, -x, x); 868 return x; 869 } 870 871 /* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION" 872 https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf 873 874 approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9 875 876 Some simplifications: 877 1. tan(x) is periodic, -PI/2 < x < PI/2 878 2. tan(x) is odd, so tan(-x) = -tan(x) 879 3. Our polynomial approximation is best near zero, so we use the following identity 880 tan(x) + tan(y) 881 tan(x + y) = ----------------- 882 1 - tan(x)*tan(y) 883 tan(PI/4) = 1 884 885 So for x > PI/8, we do the following refactor: 886 x' = x - PI/4 887 888 1 + tan(x') 889 tan(x) = ------------ 890 1 - tan(x') 891 */ approx_tan(F32 x)892 F32 Builder::approx_tan(F32 x) { 893 constexpr float Pi = SK_ScalarPI; 894 // periodic between -pi/2 ... pi/2 895 // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back 896 x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2); 897 898 I32 neg = (x < 0.0f); 899 x = select(neg, -x, x); 900 901 // minimize total error by shifting if x > pi/8 902 I32 use_quotient = (x > (Pi/8)); 903 x = select(use_quotient, x - (Pi/4), x); 904 905 // 9th order poly = 4th order(x^2) * x 906 x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x; 907 x = select(use_quotient, (1+x)/(1-x), x); 908 x = select(neg, -x, x); 909 return x; 910 } 911 912 // http://mathforum.org/library/drmath/view/54137.html 913 // referencing Handbook of Mathematical Functions, 914 // by Milton Abramowitz and Irene Stegun approx_asin(F32 x)915 F32 Builder::approx_asin(F32 x) { 916 I32 neg = (x < 0.0f); 917 x = select(neg, -x, x); 918 x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f); 919 x = select(neg, -x, x); 920 return x; 921 } 922 923 /* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/ 924 * with 129 values of x,atan(x) for x:[0...1] 925 * This only works for 0 <= x <= 1 926 */ approx_atan_unit(F32 x)927 static F32 approx_atan_unit(F32 x) { 928 // for now we might be given NaN... let that through 929 x->assert_true((x != x) | ((x >= 0) & (x <= 1))); 930 return poly(x, 0.14130025741326729f, 931 -0.34312835980675116f, 932 -0.016172900528248768f, 933 1.0037696976200385f, 934 -0.00014758242182738969f); 935 } 936 937 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 938 */ approx_atan(F32 x)939 F32 Builder::approx_atan(F32 x) { 940 I32 neg = (x < 0.0f); 941 x = select(neg, -x, x); 942 I32 flip = (x > 1.0f); 943 x = select(flip, 1/x, x); 944 x = approx_atan_unit(x); 945 x = select(flip, SK_ScalarPI/2 - x, x); 946 x = select(neg, -x, x); 947 return x; 948 } 949 950 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 951 * By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit() 952 * which avoids a 2nd divide instruction if we had instead called atan(). 953 */ approx_atan2(F32 y0,F32 x0)954 F32 Builder::approx_atan2(F32 y0, F32 x0) { 955 956 I32 flip = (abs(y0) > abs(x0)); 957 F32 y = select(flip, x0, y0); 958 F32 x = select(flip, y0, x0); 959 F32 arg = y/x; 960 961 I32 neg = (arg < 0.0f); 962 arg = select(neg, -arg, arg); 963 964 F32 r = approx_atan_unit(arg); 965 r = select(flip, SK_ScalarPI/2 - r, r); 966 r = select(neg, -r, r); 967 968 // handle quadrant distinctions 969 r = select((y0 >= 0) & (x0 < 0), r + SK_ScalarPI, r); 970 r = select((y0 < 0) & (x0 <= 0), r - SK_ScalarPI, r); 971 // Note: we don't try to handle 0,0 or infinities 972 return r; 973 } 974 min(F32 x,F32 y)975 F32 Builder::min(F32 x, F32 y) { 976 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); } 977 return {this, this->push(Op::min_f32, x.id, y.id)}; 978 } max(F32 x,F32 y)979 F32 Builder::max(F32 x, F32 y) { 980 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); } 981 return {this, this->push(Op::max_f32, x.id, y.id)}; 982 } 983 984 SK_NO_SANITIZE("signed-integer-overflow") add(I32 x,I32 y)985 I32 Builder::add(I32 x, I32 y) { 986 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } 987 this->canonicalizeIdOrder(x, y); 988 if (this->isImm(y.id, 0)) { return x; } // x+0 == x 989 return {this, this->push(Op::add_i32, x.id, y.id)}; 990 } 991 SK_NO_SANITIZE("signed-integer-overflow") sub(I32 x,I32 y)992 I32 Builder::sub(I32 x, I32 y) { 993 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } 994 if (this->isImm(y.id, 0)) { return x; } 995 return {this, this->push(Op::sub_i32, x.id, y.id)}; 996 } 997 SK_NO_SANITIZE("signed-integer-overflow") mul(I32 x,I32 y)998 I32 Builder::mul(I32 x, I32 y) { 999 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } 1000 this->canonicalizeIdOrder(x, y); 1001 if (this->isImm(y.id, 0)) { return splat(0); } // x*0 == 0 1002 if (this->isImm(y.id, 1)) { return x; } // x*1 == x 1003 return {this, this->push(Op::mul_i32, x.id, y.id)}; 1004 } 1005 1006 SK_NO_SANITIZE("shift") shl(I32 x,int bits)1007 I32 Builder::shl(I32 x, int bits) { 1008 if (bits == 0) { return x; } 1009 if (int X; this->allImm(x.id,&X)) { return splat(X << bits); } 1010 return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)}; 1011 } shr(I32 x,int bits)1012 I32 Builder::shr(I32 x, int bits) { 1013 if (bits == 0) { return x; } 1014 if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); } 1015 return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)}; 1016 } sra(I32 x,int bits)1017 I32 Builder::sra(I32 x, int bits) { 1018 if (bits == 0) { return x; } 1019 if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); } 1020 return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)}; 1021 } 1022 eq(F32 x,F32 y)1023 I32 Builder:: eq(F32 x, F32 y) { 1024 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } 1025 this->canonicalizeIdOrder(x, y); 1026 return {this, this->push(Op::eq_f32, x.id, y.id)}; 1027 } neq(F32 x,F32 y)1028 I32 Builder::neq(F32 x, F32 y) { 1029 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } 1030 this->canonicalizeIdOrder(x, y); 1031 return {this, this->push(Op::neq_f32, x.id, y.id)}; 1032 } lt(F32 x,F32 y)1033 I32 Builder::lt(F32 x, F32 y) { 1034 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); } 1035 return {this, this->push(Op::gt_f32, y.id, x.id)}; 1036 } lte(F32 x,F32 y)1037 I32 Builder::lte(F32 x, F32 y) { 1038 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); } 1039 return {this, this->push(Op::gte_f32, y.id, x.id)}; 1040 } gt(F32 x,F32 y)1041 I32 Builder::gt(F32 x, F32 y) { 1042 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } 1043 return {this, this->push(Op::gt_f32, x.id, y.id)}; 1044 } gte(F32 x,F32 y)1045 I32 Builder::gte(F32 x, F32 y) { 1046 if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } 1047 return {this, this->push(Op::gte_f32, x.id, y.id)}; 1048 } 1049 eq(I32 x,I32 y)1050 I32 Builder:: eq(I32 x, I32 y) { 1051 if (x.id == y.id) { return splat(~0); } 1052 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } 1053 this->canonicalizeIdOrder(x, y); 1054 return {this, this->push(Op:: eq_i32, x.id, y.id)}; 1055 } neq(I32 x,I32 y)1056 I32 Builder::neq(I32 x, I32 y) { 1057 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } 1058 return ~(x == y); 1059 } gt(I32 x,I32 y)1060 I32 Builder:: gt(I32 x, I32 y) { 1061 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } 1062 return {this, this->push(Op:: gt_i32, x.id, y.id)}; 1063 } gte(I32 x,I32 y)1064 I32 Builder::gte(I32 x, I32 y) { 1065 if (x.id == y.id) { return splat(~0); } 1066 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } 1067 return ~(x < y); 1068 } lt(I32 x,I32 y)1069 I32 Builder:: lt(I32 x, I32 y) { return y>x; } lte(I32 x,I32 y)1070 I32 Builder::lte(I32 x, I32 y) { return y>=x; } 1071 holdsBitNot(Val id)1072 Val Builder::holdsBitNot(Val id) { 1073 // We represent `~x` as `x ^ ~0`. 1074 if (fProgram[id].op == Op::bit_xor && this->isImm(fProgram[id].y, ~0)) { 1075 return fProgram[id].x; 1076 } 1077 return NA; 1078 } 1079 bit_and(I32 x,I32 y)1080 I32 Builder::bit_and(I32 x, I32 y) { 1081 if (x.id == y.id) { return x; } 1082 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); } 1083 this->canonicalizeIdOrder(x, y); 1084 if (this->isImm(y.id, 0)) { return splat(0); } // (x & false) == false 1085 if (this->isImm(y.id,~0)) { return x; } // (x & true) == x 1086 if (Val notX = this->holdsBitNot(x.id); notX != NA) { // (~x & y) == bit_clear(y, ~x) 1087 return bit_clear(y, {this, notX}); 1088 } 1089 if (Val notY = this->holdsBitNot(y.id); notY != NA) { // (x & ~y) == bit_clear(x, ~y) 1090 return bit_clear(x, {this, notY}); 1091 } 1092 return {this, this->push(Op::bit_and, x.id, y.id)}; 1093 } bit_or(I32 x,I32 y)1094 I32 Builder::bit_or(I32 x, I32 y) { 1095 if (x.id == y.id) { return x; } 1096 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); } 1097 this->canonicalizeIdOrder(x, y); 1098 if (this->isImm(y.id, 0)) { return x; } // (x | false) == x 1099 if (this->isImm(y.id,~0)) { return splat(~0); } // (x | true) == true 1100 return {this, this->push(Op::bit_or, x.id, y.id)}; 1101 } bit_xor(I32 x,I32 y)1102 I32 Builder::bit_xor(I32 x, I32 y) { 1103 if (x.id == y.id) { return splat(0); } 1104 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); } 1105 this->canonicalizeIdOrder(x, y); 1106 if (this->isImm(y.id, 0)) { return x; } // (x ^ false) == x 1107 return {this, this->push(Op::bit_xor, x.id, y.id)}; 1108 } 1109 bit_clear(I32 x,I32 y)1110 I32 Builder::bit_clear(I32 x, I32 y) { 1111 if (x.id == y.id) { return splat(0); } 1112 if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); } 1113 if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x 1114 if (this->isImm(y.id,~0)) { return splat(0); } // (x & ~true) == false 1115 if (this->isImm(x.id, 0)) { return splat(0); } // (false & ~y) == false 1116 return {this, this->push(Op::bit_clear, x.id, y.id)}; 1117 } 1118 select(I32 x,I32 y,I32 z)1119 I32 Builder::select(I32 x, I32 y, I32 z) { 1120 if (y.id == z.id) { return y; } 1121 if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); } 1122 if (this->isImm(x.id,~0)) { return y; } // (true ? y : z) == y 1123 if (this->isImm(x.id, 0)) { return z; } // (false ? y : z) == z 1124 if (this->isImm(y.id, 0)) { return bit_clear(z,x); } // (x ? 0 : z) == ~x&z 1125 if (this->isImm(z.id, 0)) { return bit_and (y,x); } // (x ? y : 0) == x&y 1126 if (Val notX = this->holdsBitNot(x.id); notX != NA) { // (!x ? y : z) == (x ? z : y) 1127 x.id = notX; 1128 std::swap(y, z); 1129 } 1130 return {this, this->push(Op::select, x.id, y.id, z.id)}; 1131 } 1132 extract(I32 x,int bits,I32 z)1133 I32 Builder::extract(I32 x, int bits, I32 z) { 1134 if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); } 1135 return bit_and(z, shr(x, bits)); 1136 } 1137 pack(I32 x,I32 y,int bits)1138 I32 Builder::pack(I32 x, I32 y, int bits) { 1139 return bit_or(x, shl(y, bits)); 1140 } 1141 ceil(F32 x)1142 F32 Builder::ceil(F32 x) { 1143 if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); } 1144 return {this, this->push(Op::ceil, x.id)}; 1145 } floor(F32 x)1146 F32 Builder::floor(F32 x) { 1147 if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); } 1148 return {this, this->push(Op::floor, x.id)}; 1149 } to_F32(I32 x)1150 F32 Builder::to_F32(I32 x) { 1151 if (int X; this->allImm(x.id,&X)) { return splat((float)X); } 1152 return {this, this->push(Op::to_f32, x.id)}; 1153 } trunc(F32 x)1154 I32 Builder::trunc(F32 x) { 1155 if (float X; this->allImm(x.id,&X)) { return splat((int)X); } 1156 return {this, this->push(Op::trunc, x.id)}; 1157 } round(F32 x)1158 I32 Builder::round(F32 x) { 1159 if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); } 1160 return {this, this->push(Op::round, x.id)}; 1161 } 1162 to_fp16(F32 x)1163 I32 Builder::to_fp16(F32 x) { 1164 if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); } 1165 return {this, this->push(Op::to_fp16, x.id)}; 1166 } from_fp16(I32 x)1167 F32 Builder::from_fp16(I32 x) { 1168 if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); } 1169 return {this, this->push(Op::from_fp16, x.id)}; 1170 } 1171 from_unorm(int bits,I32 x)1172 F32 Builder::from_unorm(int bits, I32 x) { 1173 F32 limit = splat(1 / ((1<<bits)-1.0f)); 1174 return mul(to_F32(x), limit); 1175 } to_unorm(int bits,F32 x)1176 I32 Builder::to_unorm(int bits, F32 x) { 1177 F32 limit = splat((1<<bits)-1.0f); 1178 return round(mul(x, limit)); 1179 } 1180 SkColorType_to_PixelFormat(SkColorType ct)1181 PixelFormat SkColorType_to_PixelFormat(SkColorType ct) { 1182 auto UNORM = PixelFormat::UNORM, 1183 SRGB = PixelFormat::SRGB, 1184 FLOAT = PixelFormat::FLOAT, 1185 XRNG = PixelFormat::XRNG; 1186 switch (ct) { 1187 case kUnknown_SkColorType: break; 1188 1189 case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96}; 1190 1191 case kRGBA_F16Norm_SkColorType: return {FLOAT,16,16,16,16, 0,16,32,48}; 1192 case kRGBA_F16_SkColorType: return {FLOAT,16,16,16,16, 0,16,32,48}; 1193 case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48}; 1194 1195 case kA16_float_SkColorType: return {FLOAT, 0, 0,0,16, 0, 0,0,0}; 1196 case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0}; 1197 1198 case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0}; 1199 case kGray_8_SkColorType: return {UNORM, 8,8,8,0, 0,0,0,0}; // Subtle. 1200 case kR8_unorm_SkColorType: return {UNORM, 8,0,0,0, 0,0,0,0}; 1201 1202 case kRGB_565_SkColorType: return {UNORM, 5,6,5,0, 11,5,0,0}; // (BGR) 1203 case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0}; // (ABGR) 1204 1205 case kRGBA_8888_SkColorType: return {UNORM, 8,8,8,8, 0,8,16,24}; 1206 case kRGB_888x_SkColorType: return {UNORM, 8,8,8,0, 0,8,16,32}; // 32-bit 1207 case kBGRA_8888_SkColorType: return {UNORM, 8,8,8,8, 16,8, 0,24}; 1208 case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8, 0,8,16,24}; 1209 1210 case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2, 0,10,20,30}; 1211 case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30}; 1212 case kRGB_101010x_SkColorType: return {UNORM, 10,10,10,0, 0,10,20, 0}; 1213 case kBGR_101010x_SkColorType: return {UNORM, 10,10,10,0, 20,10, 0, 0}; 1214 case kBGR_101010x_XR_SkColorType: return { XRNG, 10,10,10,0, 20,10, 0, 0}; 1215 1216 case kR8G8_unorm_SkColorType: return {UNORM, 8, 8,0, 0, 0, 8,0,0}; 1217 case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0}; 1218 case kA16_unorm_SkColorType: return {UNORM, 0, 0,0,16, 0, 0,0,0}; 1219 } 1220 SkASSERT(false); 1221 return {UNORM, 0,0,0,0, 0,0,0,0}; 1222 } 1223 byte_size(PixelFormat f)1224 static int byte_size(PixelFormat f) { 1225 // What's the highest bit we read? 1226 int bits = std::max(f.r_bits + f.r_shift, 1227 std::max(f.g_bits + f.g_shift, 1228 std::max(f.b_bits + f.b_shift, 1229 f.a_bits + f.a_shift))); 1230 // Round up to bytes. 1231 return (bits + 7) / 8; 1232 } 1233 unpack(PixelFormat f,I32 x)1234 static Color unpack(PixelFormat f, I32 x) { 1235 SkASSERT(byte_size(f) <= 4); 1236 1237 auto from_srgb = [](int bits, I32 channel) -> F32 { 1238 const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction(); 1239 F32 v = from_unorm(bits, channel); 1240 return sk_program_transfer_fn(v, skcms_TFType_sRGBish, 1241 v->splat(tf->g), 1242 v->splat(tf->a), 1243 v->splat(tf->b), 1244 v->splat(tf->c), 1245 v->splat(tf->d), 1246 v->splat(tf->e), 1247 v->splat(tf->f)); 1248 }; 1249 auto from_xr = [](int bits, I32 channel) -> F32 { 1250 static constexpr float min = -0.752941f; 1251 static constexpr float max = 1.25098f; 1252 static constexpr float range = max - min; 1253 F32 v = from_unorm(bits, channel); 1254 return v * range + min; 1255 }; 1256 1257 auto unpack_rgb = [=](int bits, int shift) -> F32 { 1258 I32 channel = extract(x, shift, (1<<bits)-1); 1259 switch (f.encoding) { 1260 case PixelFormat::UNORM: return from_unorm(bits, channel); 1261 case PixelFormat:: SRGB: return from_srgb (bits, channel); 1262 case PixelFormat::FLOAT: return from_fp16 ( channel); 1263 case PixelFormat:: XRNG: return from_xr (bits, channel); 1264 } 1265 SkUNREACHABLE; 1266 }; 1267 auto unpack_alpha = [=](int bits, int shift) -> F32 { 1268 I32 channel = extract(x, shift, (1<<bits)-1); 1269 switch (f.encoding) { 1270 case PixelFormat::UNORM: 1271 case PixelFormat:: SRGB: return from_unorm(bits, channel); 1272 case PixelFormat::FLOAT: return from_fp16 ( channel); 1273 case PixelFormat:: XRNG: return from_xr (bits, channel); 1274 } 1275 SkUNREACHABLE; 1276 }; 1277 return { 1278 f.r_bits ? unpack_rgb (f.r_bits, f.r_shift) : x->splat(0.0f), 1279 f.g_bits ? unpack_rgb (f.g_bits, f.g_shift) : x->splat(0.0f), 1280 f.b_bits ? unpack_rgb (f.b_bits, f.b_shift) : x->splat(0.0f), 1281 f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f), 1282 }; 1283 } 1284 split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1285 static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) { 1286 SkASSERT(byte_size(f) == 8); 1287 // We assume some of the channels are in the low 32 bits, some in the high 32 bits. 1288 // The assert on byte_size(lo) will trigger if this assumption is violated. 1289 *lo = f; 1290 if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; } 1291 if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; } 1292 if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; } 1293 if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; } 1294 SkASSERT(byte_size(*lo) == 4); 1295 1296 *hi = f; 1297 if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; } 1298 if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; } 1299 if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; } 1300 if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; } 1301 SkASSERT(byte_size(*hi) == 4); 1302 } 1303 1304 // The only 16-byte format we support today is RGBA F32, 1305 // though, TODO, we could generalize that to any swizzle, and to allow UNORM too. assert_16byte_is_rgba_f32(PixelFormat f)1306 static void assert_16byte_is_rgba_f32(PixelFormat f) { 1307 #if defined(SK_DEBUG) 1308 SkASSERT(byte_size(f) == 16); 1309 PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType); 1310 1311 SkASSERT(f.encoding == rgba_f32.encoding); 1312 1313 SkASSERT(f.r_bits == rgba_f32.r_bits); 1314 SkASSERT(f.g_bits == rgba_f32.g_bits); 1315 SkASSERT(f.b_bits == rgba_f32.b_bits); 1316 SkASSERT(f.a_bits == rgba_f32.a_bits); 1317 1318 SkASSERT(f.r_shift == rgba_f32.r_shift); 1319 SkASSERT(f.g_shift == rgba_f32.g_shift); 1320 SkASSERT(f.b_shift == rgba_f32.b_shift); 1321 SkASSERT(f.a_shift == rgba_f32.a_shift); 1322 #endif 1323 } 1324 load(PixelFormat f,Ptr ptr)1325 Color Builder::load(PixelFormat f, Ptr ptr) { 1326 switch (byte_size(f)) { 1327 case 1: return unpack(f, load8 (ptr)); 1328 case 2: return unpack(f, load16(ptr)); 1329 case 4: return unpack(f, load32(ptr)); 1330 case 8: { 1331 PixelFormat lo,hi; 1332 split_disjoint_8byte_format(f, &lo,&hi); 1333 Color l = unpack(lo, load64(ptr, 0)), 1334 h = unpack(hi, load64(ptr, 1)); 1335 return { 1336 lo.r_bits ? l.r : h.r, 1337 lo.g_bits ? l.g : h.g, 1338 lo.b_bits ? l.b : h.b, 1339 lo.a_bits ? l.a : h.a, 1340 }; 1341 } 1342 case 16: { 1343 assert_16byte_is_rgba_f32(f); 1344 return { 1345 pun_to_F32(load128(ptr, 0)), 1346 pun_to_F32(load128(ptr, 1)), 1347 pun_to_F32(load128(ptr, 2)), 1348 pun_to_F32(load128(ptr, 3)), 1349 }; 1350 } 1351 default: SkUNREACHABLE; 1352 } 1353 } 1354 gather(PixelFormat f,UPtr ptr,int offset,I32 index)1355 Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) { 1356 switch (byte_size(f)) { 1357 case 1: return unpack(f, gather8 (ptr, offset, index)); 1358 case 2: return unpack(f, gather16(ptr, offset, index)); 1359 case 4: return unpack(f, gather32(ptr, offset, index)); 1360 case 8: { 1361 PixelFormat lo,hi; 1362 split_disjoint_8byte_format(f, &lo,&hi); 1363 Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)), 1364 h = unpack(hi, gather32(ptr, offset, (index<<1)+1)); 1365 return { 1366 lo.r_bits ? l.r : h.r, 1367 lo.g_bits ? l.g : h.g, 1368 lo.b_bits ? l.b : h.b, 1369 lo.a_bits ? l.a : h.a, 1370 }; 1371 } 1372 case 16: { 1373 assert_16byte_is_rgba_f32(f); 1374 return { 1375 gatherF(ptr, offset, (index<<2)+0), 1376 gatherF(ptr, offset, (index<<2)+1), 1377 gatherF(ptr, offset, (index<<2)+2), 1378 gatherF(ptr, offset, (index<<2)+3), 1379 }; 1380 } 1381 default: SkUNREACHABLE; 1382 } 1383 } 1384 pack32(PixelFormat f,Color c)1385 static I32 pack32(PixelFormat f, Color c) { 1386 SkASSERT(byte_size(f) <= 4); 1387 1388 auto to_srgb = [](int bits, F32 v) { 1389 const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction(); 1390 return to_unorm(bits, sk_program_transfer_fn(v, skcms_TFType_sRGBish, 1391 v->splat(tf->g), 1392 v->splat(tf->a), 1393 v->splat(tf->b), 1394 v->splat(tf->c), 1395 v->splat(tf->d), 1396 v->splat(tf->e), 1397 v->splat(tf->f))); 1398 }; 1399 auto to_xr = [](int bits, F32 v) { 1400 static constexpr float min = -0.752941f; 1401 static constexpr float max = 1.25098f; 1402 static constexpr float range = max - min; 1403 return to_unorm(bits, (v - min) * (1.0f / range)); 1404 }; 1405 1406 I32 packed = c->splat(0); 1407 auto pack_rgb = [&](F32 channel, int bits, int shift) { 1408 I32 encoded; 1409 switch (f.encoding) { 1410 case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break; 1411 case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break; 1412 case PixelFormat::FLOAT: encoded = to_fp16 ( channel); break; 1413 case PixelFormat:: XRNG: encoded = to_xr (bits, channel); break; 1414 } 1415 packed = pack(packed, encoded, shift); 1416 }; 1417 auto pack_alpha = [&](F32 channel, int bits, int shift) { 1418 I32 encoded; 1419 switch (f.encoding) { 1420 case PixelFormat::UNORM: 1421 case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break; 1422 case PixelFormat::FLOAT: encoded = to_fp16 ( channel); break; 1423 case PixelFormat:: XRNG: encoded = to_xr (bits, channel); break; 1424 } 1425 packed = pack(packed, encoded, shift); 1426 }; 1427 if (f.r_bits) { pack_rgb (c.r, f.r_bits, f.r_shift); } 1428 if (f.g_bits) { pack_rgb (c.g, f.g_bits, f.g_shift); } 1429 if (f.b_bits) { pack_rgb (c.b, f.b_bits, f.b_shift); } 1430 if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); } 1431 return packed; 1432 } 1433 store(PixelFormat f,Ptr ptr,Color c)1434 void Builder::store(PixelFormat f, Ptr ptr, Color c) { 1435 // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal. 1436 if (f.r_bits == f.g_bits && f.g_bits == f.b_bits && 1437 f.r_shift == f.g_shift && f.g_shift == f.b_shift) { 1438 1439 // TODO: pull these coefficients from an SkColorSpace? This is sRGB luma/luminance. 1440 c.r = c.r * 0.2126f 1441 + c.g * 0.7152f 1442 + c.b * 0.0722f; 1443 f.g_bits = f.b_bits = 0; 1444 } 1445 1446 switch (byte_size(f)) { 1447 case 1: store8 (ptr, pack32(f,c)); break; 1448 case 2: store16(ptr, pack32(f,c)); break; 1449 case 4: store32(ptr, pack32(f,c)); break; 1450 case 8: { 1451 PixelFormat lo,hi; 1452 split_disjoint_8byte_format(f, &lo,&hi); 1453 store64(ptr, pack32(lo,c) 1454 , pack32(hi,c)); 1455 break; 1456 } 1457 case 16: { 1458 assert_16byte_is_rgba_f32(f); 1459 store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a)); 1460 break; 1461 } 1462 default: SkUNREACHABLE; 1463 } 1464 } 1465 unpremul(F32 * r,F32 * g,F32 * b,F32 a)1466 void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) { 1467 skvm::F32 invA = 1.0f / a, 1468 inf = pun_to_F32(splat(0x7f800000)); 1469 // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0). 1470 invA = select(invA < inf, invA 1471 , 0.0f); 1472 *r *= invA; 1473 *g *= invA; 1474 *b *= invA; 1475 } 1476 premul(F32 * r,F32 * g,F32 * b,F32 a)1477 void Builder::premul(F32* r, F32* g, F32* b, F32 a) { 1478 *r *= a; 1479 *g *= a; 1480 *b *= a; 1481 } 1482 uniformColor(SkColor4f color,Uniforms * uniforms)1483 Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) { 1484 auto [r,g,b,a] = color; 1485 return { 1486 uniformF(uniforms->pushF(r)), 1487 uniformF(uniforms->pushF(g)), 1488 uniformF(uniforms->pushF(b)), 1489 uniformF(uniforms->pushF(a)), 1490 }; 1491 } 1492 lerp(F32 lo,F32 hi,F32 t)1493 F32 Builder::lerp(F32 lo, F32 hi, F32 t) { 1494 if (this->isImm(t.id, 0.0f)) { return lo; } 1495 if (this->isImm(t.id, 1.0f)) { return hi; } 1496 return mad(sub(hi, lo), t, lo); 1497 } 1498 lerp(Color lo,Color hi,F32 t)1499 Color Builder::lerp(Color lo, Color hi, F32 t) { 1500 return { 1501 lerp(lo.r, hi.r, t), 1502 lerp(lo.g, hi.g, t), 1503 lerp(lo.b, hi.b, t), 1504 lerp(lo.a, hi.a, t), 1505 }; 1506 } 1507 to_hsla(Color c)1508 HSLA Builder::to_hsla(Color c) { 1509 F32 mx = max(max(c.r,c.g),c.b), 1510 mn = min(min(c.r,c.g),c.b), 1511 d = mx - mn, 1512 invd = 1.0f / d, 1513 g_lt_b = select(c.g < c.b, splat(6.0f) 1514 , splat(0.0f)); 1515 1516 F32 h = (1/6.0f) * select(mx == mn, 0.0f, 1517 select(mx == c.r, invd * (c.g - c.b) + g_lt_b, 1518 select(mx == c.g, invd * (c.b - c.r) + 2.0f 1519 , invd * (c.r - c.g) + 4.0f))); 1520 1521 F32 sum = mx + mn, 1522 l = sum * 0.5f, 1523 s = select(mx == mn, 0.0f 1524 , d / select(l > 0.5f, 2.0f - sum 1525 , sum)); 1526 return {h, s, l, c.a}; 1527 } 1528 to_rgba(HSLA c)1529 Color Builder::to_rgba(HSLA c) { 1530 // See GrRGBToHSLFilterEffect.fp 1531 1532 auto [h,s,l,a] = c; 1533 F32 x = s * (1.0f - abs(l + l - 1.0f)); 1534 1535 auto hue_to_rgb = [&,l=l](auto hue) { 1536 auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f; 1537 return x * (clamp01(q) - 0.5f) + l; 1538 }; 1539 1540 return { 1541 hue_to_rgb(h + 0/3.0f), 1542 hue_to_rgb(h + 2/3.0f), 1543 hue_to_rgb(h + 1/3.0f), 1544 c.a, 1545 }; 1546 } 1547 1548 // We're basing our implementation of non-separable blend modes on 1549 // https://www.w3.org/TR/compositing-1/#blendingnonseparable. 1550 // and 1551 // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf 1552 // They're equivalent, but ES' math has been better simplified. 1553 // 1554 // Anything extra we add beyond that is to make the math work with premul inputs. 1555 saturation(skvm::F32 r,skvm::F32 g,skvm::F32 b)1556 static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1557 return max(r, max(g, b)) 1558 - min(r, min(g, b)); 1559 } 1560 luminance(skvm::F32 r,skvm::F32 g,skvm::F32 b)1561 static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1562 return r*0.30f + g*0.59f + b*0.11f; 1563 } 1564 set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1565 static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) { 1566 F32 mn = min(*r, min(*g, *b)), 1567 mx = max(*r, max(*g, *b)), 1568 sat = mx - mn; 1569 1570 // Map min channel to 0, max channel to s, and scale the middle proportionally. 1571 auto scale = [&](skvm::F32 c) { 1572 auto scaled = ((c - mn) * s) / sat; 1573 return select(is_finite(scaled), scaled, 0.0f); 1574 }; 1575 *r = scale(*r); 1576 *g = scale(*g); 1577 *b = scale(*b); 1578 } 1579 set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1580 static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { 1581 auto diff = lu - luminance(*r, *g, *b); 1582 *r += diff; 1583 *g += diff; 1584 *b += diff; 1585 } 1586 clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1587 static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { 1588 F32 mn = min(*r, min(*g, *b)), 1589 mx = max(*r, max(*g, *b)), 1590 lu = luminance(*r, *g, *b); 1591 1592 auto clip = [&](auto c) { 1593 c = select(mn < 0 & lu != mn, lu + ((c-lu)*( lu)) / (lu-mn), c); 1594 c = select(mx > a & lu != mx, lu + ((c-lu)*(a-lu)) / (mx-lu), c); 1595 return clamp01(c); // May be a little negative, or worse, NaN. 1596 }; 1597 *r = clip(*r); 1598 *g = clip(*g); 1599 *b = clip(*b); 1600 } 1601 blend(SkBlendMode mode,Color src,Color dst)1602 Color Builder::blend(SkBlendMode mode, Color src, Color dst) { 1603 auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) { 1604 return x*y + z*w; 1605 }; 1606 1607 auto two = [](skvm::F32 x) { return x+x; }; 1608 1609 auto apply_rgba = [&](auto fn) { 1610 return Color { 1611 fn(src.r, dst.r), 1612 fn(src.g, dst.g), 1613 fn(src.b, dst.b), 1614 fn(src.a, dst.a), 1615 }; 1616 }; 1617 1618 auto apply_rgb_srcover_a = [&](auto fn) { 1619 return Color { 1620 fn(src.r, dst.r), 1621 fn(src.g, dst.g), 1622 fn(src.b, dst.b), 1623 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1624 }; 1625 }; 1626 1627 auto non_sep = [&](auto R, auto G, auto B) { 1628 return Color{ 1629 R + mma(src.r, 1-dst.a, dst.r, 1-src.a), 1630 G + mma(src.g, 1-dst.a, dst.g, 1-src.a), 1631 B + mma(src.b, 1-dst.a, dst.b, 1-src.a), 1632 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1633 }; 1634 }; 1635 1636 switch (mode) { 1637 default: 1638 SkASSERT(false); 1639 [[fallthrough]]; /*but also, for safety, fallthrough*/ 1640 1641 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) }; 1642 1643 case SkBlendMode::kSrc: return src; 1644 case SkBlendMode::kDst: return dst; 1645 1646 case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]]; 1647 case SkBlendMode::kSrcOver: 1648 return apply_rgba([&](auto s, auto d) { 1649 return mad(d,1-src.a, s); 1650 }); 1651 1652 case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]]; 1653 case SkBlendMode::kSrcIn: 1654 return apply_rgba([&](auto s, auto d) { 1655 return s * dst.a; 1656 }); 1657 1658 case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]]; 1659 1660 case SkBlendMode::kSrcOut: 1661 return apply_rgba([&](auto s, auto d) { 1662 return s * (1-dst.a); 1663 }); 1664 1665 case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]]; 1666 case SkBlendMode::kSrcATop: 1667 return apply_rgba([&](auto s, auto d) { 1668 return mma(s, dst.a, d, 1-src.a); 1669 }); 1670 1671 case SkBlendMode::kXor: 1672 return apply_rgba([&](auto s, auto d) { 1673 return mma(s, 1-dst.a, d, 1-src.a); 1674 }); 1675 1676 case SkBlendMode::kPlus: 1677 return apply_rgba([&](auto s, auto d) { 1678 return min(s+d, 1.0f); 1679 }); 1680 1681 case SkBlendMode::kModulate: 1682 return apply_rgba([&](auto s, auto d) { 1683 return s * d; 1684 }); 1685 1686 case SkBlendMode::kScreen: 1687 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts. 1688 // It's kind of plausible that s + (d - sd) keeps more precision? 1689 return apply_rgba([&](auto s, auto d) { 1690 return s + (d - s*d); 1691 }); 1692 1693 case SkBlendMode::kDarken: 1694 return apply_rgb_srcover_a([&](auto s, auto d) { 1695 return s + (d - max(s * dst.a, 1696 d * src.a)); 1697 }); 1698 1699 case SkBlendMode::kLighten: 1700 return apply_rgb_srcover_a([&](auto s, auto d) { 1701 return s + (d - min(s * dst.a, 1702 d * src.a)); 1703 }); 1704 1705 case SkBlendMode::kDifference: 1706 return apply_rgb_srcover_a([&](auto s, auto d) { 1707 return s + (d - two(min(s * dst.a, 1708 d * src.a))); 1709 }); 1710 1711 case SkBlendMode::kExclusion: 1712 return apply_rgb_srcover_a([&](auto s, auto d) { 1713 return s + (d - two(s * d)); 1714 }); 1715 1716 case SkBlendMode::kColorBurn: 1717 return apply_rgb_srcover_a([&](auto s, auto d) { 1718 auto mn = min(dst.a, 1719 src.a * (dst.a - d) / s), 1720 burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a); 1721 return select(d == dst.a , s * (1-dst.a) + d, 1722 select(is_finite(burn), burn 1723 , d * (1-src.a) + s)); 1724 }); 1725 1726 case SkBlendMode::kColorDodge: 1727 return apply_rgb_srcover_a([&](auto s, auto d) { 1728 auto dodge = src.a * min(dst.a, 1729 d * src.a / (src.a - s)) 1730 + mma(s, 1-dst.a, d, 1-src.a); 1731 return select(d == 0.0f , s * (1-dst.a) + d, 1732 select(is_finite(dodge), dodge 1733 , d * (1-src.a) + s)); 1734 }); 1735 1736 case SkBlendMode::kHardLight: 1737 return apply_rgb_srcover_a([&](auto s, auto d) { 1738 return mma(s, 1-dst.a, d, 1-src.a) + 1739 select(two(s) <= src.a, 1740 two(s * d), 1741 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1742 }); 1743 1744 case SkBlendMode::kOverlay: 1745 return apply_rgb_srcover_a([&](auto s, auto d) { 1746 return mma(s, 1-dst.a, d, 1-src.a) + 1747 select(two(d) <= dst.a, 1748 two(s * d), 1749 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1750 }); 1751 1752 case SkBlendMode::kMultiply: 1753 return apply_rgba([&](auto s, auto d) { 1754 return mma(s, 1-dst.a, d, 1-src.a) + s * d; 1755 }); 1756 1757 case SkBlendMode::kSoftLight: 1758 return apply_rgb_srcover_a([&](auto s, auto d) { 1759 auto m = select(dst.a > 0.0f, d / dst.a 1760 , 0.0f), 1761 s2 = two(s), 1762 m4 = 4*m; 1763 1764 // The logic forks three ways: 1765 // 1. dark src? 1766 // 2. light src, dark dst? 1767 // 3. light src, light dst? 1768 1769 // Used in case 1 1770 auto darkSrc = d * ((s2-src.a) * (1-m) + src.a), 1771 // Used in case 2 1772 darkDst = (m4 * m4 + m4) * (m-1) + 7*m, 1773 // Used in case 3. 1774 liteDst = sqrt(m) - m, 1775 // Used in 2 or 3? 1776 liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst 1777 , liteDst) 1778 + d * src.a; 1779 return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc 1780 , liteSrc); 1781 }); 1782 1783 case SkBlendMode::kHue: { 1784 skvm::F32 R = src.r * src.a, 1785 G = src.g * src.a, 1786 B = src.b * src.a; 1787 1788 set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b)); 1789 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1790 clip_color(&R, &G, &B, src.a * dst.a); 1791 1792 return non_sep(R, G, B); 1793 } 1794 1795 case SkBlendMode::kSaturation: { 1796 skvm::F32 R = dst.r * src.a, 1797 G = dst.g * src.a, 1798 B = dst.b * src.a; 1799 1800 set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b)); 1801 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1802 clip_color(&R, &G, &B, src.a * dst.a); 1803 1804 return non_sep(R, G, B); 1805 } 1806 1807 case SkBlendMode::kColor: { 1808 skvm::F32 R = src.r * dst.a, 1809 G = src.g * dst.a, 1810 B = src.b * dst.a; 1811 1812 set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b)); 1813 clip_color(&R, &G, &B, src.a * dst.a); 1814 1815 return non_sep(R, G, B); 1816 } 1817 1818 case SkBlendMode::kLuminosity: { 1819 skvm::F32 R = dst.r * src.a, 1820 G = dst.g * src.a, 1821 B = dst.b * src.a; 1822 1823 set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b)); 1824 clip_color(&R, &G, &B, dst.a * src.a); 1825 1826 return non_sep(R, G, B); 1827 } 1828 } 1829 } 1830 1831 // ~~~~ Program::eval() and co. ~~~~ // 1832 1833 // Handy references for x86-64 instruction encoding: 1834 // https://wiki.osdev.org/X86-64_Instruction_Encoding 1835 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm 1836 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm 1837 // http://ref.x86asm.net/coder64.html 1838 1839 // Used for ModRM / immediate instruction encoding. _233(int a,int b,int c)1840 static uint8_t _233(int a, int b, int c) { 1841 return (a & 3) << 6 1842 | (b & 7) << 3 1843 | (c & 7) << 0; 1844 } 1845 1846 // ModRM byte encodes the arguments of an opcode. 1847 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; mod_rm(Mod mod,int reg,int rm)1848 static uint8_t mod_rm(Mod mod, int reg, int rm) { 1849 return _233((int)mod, reg, rm); 1850 } 1851 mod(int imm)1852 static Mod mod(int imm) { 1853 if (imm == 0) { return Mod::Indirect; } 1854 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } 1855 return Mod::FourByteImm; 1856 } 1857 imm_bytes(Mod mod)1858 static int imm_bytes(Mod mod) { 1859 switch (mod) { 1860 case Mod::Indirect: return 0; 1861 case Mod::OneByteImm: return 1; 1862 case Mod::FourByteImm: return 4; 1863 case Mod::Direct: SkUNREACHABLE; 1864 } 1865 SkUNREACHABLE; 1866 } 1867 1868 // SIB byte encodes a memory address, base + (index * scale). sib(Assembler::Scale scale,int index,int base)1869 static uint8_t sib(Assembler::Scale scale, int index, int base) { 1870 return _233((int)scale, index, base); 1871 } 1872 1873 // The REX prefix is used to extend most old 32-bit instructions to 64-bit. rex(bool W,bool R,bool X,bool B)1874 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit. 1875 bool R, // Extra top bit to select ModRM reg, registers 8-15. 1876 bool X, // Extra top bit for SIB index register. 1877 bool B) { // Extra top bit for SIB base or ModRM rm register. 1878 return 0b01000000 // Fixed 0100 for top four bits. 1879 | (W << 3) 1880 | (R << 2) 1881 | (X << 1) 1882 | (B << 0); 1883 } 1884 1885 1886 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. 1887 struct VEX { 1888 int len; 1889 uint8_t bytes[3]; 1890 }; 1891 vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1892 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? 1893 bool R, // Same as REX R. Pass high bit of dst register, dst>>3. 1894 bool X, // Same as REX X. 1895 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. 1896 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. 1897 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. 1898 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. 1899 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. 1900 1901 // Pack x86 opcode map selector to 5-bit VEX encoding. 1902 map = [map]{ 1903 switch (map) { 1904 case 0x0f: return 0b00001; 1905 case 0x380f: return 0b00010; 1906 case 0x3a0f: return 0b00011; 1907 // Several more cases only used by XOP / TBM. 1908 } 1909 SkUNREACHABLE; 1910 }(); 1911 1912 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. 1913 pp = [pp]{ 1914 switch (pp) { 1915 case 0x66: return 0b01; 1916 case 0xf3: return 0b10; 1917 case 0xf2: return 0b11; 1918 } 1919 return 0b00; 1920 }(); 1921 1922 VEX vex = {0, {0,0,0}}; 1923 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { 1924 // With these conditions met, we can optionally compress VEX to 2-byte. 1925 vex.len = 2; 1926 vex.bytes[0] = 0xc5; 1927 vex.bytes[1] = (pp & 3) << 0 1928 | (L & 1) << 2 1929 | (~vvvv & 15) << 3 1930 | (~(int)R & 1) << 7; 1931 } else { 1932 // We could use this 3-byte VEX prefix all the time if we like. 1933 vex.len = 3; 1934 vex.bytes[0] = 0xc4; 1935 vex.bytes[1] = (map & 31) << 0 1936 | (~(int)B & 1) << 5 1937 | (~(int)X & 1) << 6 1938 | (~(int)R & 1) << 7; 1939 vex.bytes[2] = (pp & 3) << 0 1940 | (L & 1) << 2 1941 | (~vvvv & 15) << 3 1942 | (WE & 1) << 7; 1943 } 1944 return vex; 1945 } 1946 Assembler(void * buf)1947 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {} 1948 size() const1949 size_t Assembler::size() const { return fSize; } 1950 bytes(const void * p,int n)1951 void Assembler::bytes(const void* p, int n) { 1952 if (fCode) { 1953 memcpy(fCode+fSize, p, n); 1954 } 1955 fSize += n; 1956 } 1957 byte(uint8_t b)1958 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } word(uint32_t w)1959 void Assembler::word(uint32_t w) { this->bytes(&w, 4); } 1960 align(int mod)1961 void Assembler::align(int mod) { 1962 while (this->size() % mod) { 1963 this->byte(0x00); 1964 } 1965 } 1966 int3()1967 void Assembler::int3() { 1968 this->byte(0xcc); 1969 } 1970 vzeroupper()1971 void Assembler::vzeroupper() { 1972 this->byte(0xc5); 1973 this->byte(0xf8); 1974 this->byte(0x77); 1975 } ret()1976 void Assembler::ret() { this->byte(0xc3); } 1977 op(int opcode,Operand dst,GP64 x)1978 void Assembler::op(int opcode, Operand dst, GP64 x) { 1979 if (dst.kind == Operand::REG) { 1980 this->byte(rex(W1,x>>3,0,dst.reg>>3)); 1981 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1982 this->byte(mod_rm(Mod::Direct, x, dst.reg&7)); 1983 } else { 1984 SkASSERT(dst.kind == Operand::MEM); 1985 const Mem& m = dst.mem; 1986 const bool need_SIB = (m.base&7) == rsp 1987 || m.index != rsp; 1988 1989 this->byte(rex(W1,x>>3,m.index>>3,m.base>>3)); 1990 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1991 this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7)); 1992 if (need_SIB) { 1993 this->byte(sib(m.scale, m.index&7, m.base&7)); 1994 } 1995 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 1996 } 1997 } 1998 op(int opcode,int opcode_ext,Operand dst,int imm)1999 void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) { 2000 opcode |= 0b1000'0000; // top bit set for instructions with any immediate 2001 2002 int imm_bytes = 4; 2003 if (SkTFitsIn<int8_t>(imm)) { 2004 imm_bytes = 1; 2005 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. 2006 } 2007 2008 this->op(opcode, dst, (GP64)opcode_ext); 2009 this->bytes(&imm, imm_bytes); 2010 } 2011 add(Operand dst,int imm)2012 void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); } sub(Operand dst,int imm)2013 void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); } cmp(Operand dst,int imm)2014 void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); } 2015 2016 // These don't work quite like the other instructions with immediates: 2017 // these immediates are always fixed size at 4 bytes or 1 byte. mov(Operand dst,int imm)2018 void Assembler::mov(Operand dst, int imm) { 2019 this->op(0xC7,dst,(GP64)0b000); 2020 this->word(imm); 2021 } movb(Operand dst,int imm)2022 void Assembler::movb(Operand dst, int imm) { 2023 this->op(0xC6,dst,(GP64)0b000); 2024 this->byte(imm); 2025 } 2026 add(Operand dst,GP64 x)2027 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); } sub(Operand dst,GP64 x)2028 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); } cmp(Operand dst,GP64 x)2029 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); } mov(Operand dst,GP64 x)2030 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); } movb(Operand dst,GP64 x)2031 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); } 2032 add(GP64 dst,Operand x)2033 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); } sub(GP64 dst,Operand x)2034 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); } cmp(GP64 dst,Operand x)2035 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); } mov(GP64 dst,Operand x)2036 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); } movb(GP64 dst,Operand x)2037 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); } 2038 movzbq(GP64 dst,Operand x)2039 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); } movzwq(GP64 dst,Operand x)2040 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); } 2041 vpaddd(Ymm dst,Ymm x,Operand y)2042 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } vpsubd(Ymm dst,Ymm x,Operand y)2043 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } vpmulld(Ymm dst,Ymm x,Operand y)2044 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); } 2045 vpaddw(Ymm dst,Ymm x,Operand y)2046 void Assembler::vpaddw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfd, dst,x,y); } vpsubw(Ymm dst,Ymm x,Operand y)2047 void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xf9, dst,x,y); } vpmullw(Ymm dst,Ymm x,Operand y)2048 void Assembler::vpmullw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xd5, dst,x,y); } vpavgw(Ymm dst,Ymm x,Operand y)2049 void Assembler::vpavgw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xe3, dst,x,y); } vpmulhrsw(Ymm dst,Ymm x,Operand y)2050 void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); } vpminsw(Ymm dst,Ymm x,Operand y)2051 void Assembler::vpminsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xea, dst,x,y); } vpmaxsw(Ymm dst,Ymm x,Operand y)2052 void Assembler::vpmaxsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xee, dst,x,y); } vpminuw(Ymm dst,Ymm x,Operand y)2053 void Assembler::vpminuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); } vpmaxuw(Ymm dst,Ymm x,Operand y)2054 void Assembler::vpmaxuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); } 2055 vpabsw(Ymm dst,Operand x)2056 void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); } 2057 2058 vpand(Ymm dst,Ymm x,Operand y)2059 void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); } vpor(Ymm dst,Ymm x,Operand y)2060 void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); } vpxor(Ymm dst,Ymm x,Operand y)2061 void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); } vpandn(Ymm dst,Ymm x,Operand y)2062 void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); } 2063 vaddps(Ymm dst,Ymm x,Operand y)2064 void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); } vsubps(Ymm dst,Ymm x,Operand y)2065 void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); } vmulps(Ymm dst,Ymm x,Operand y)2066 void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); } vdivps(Ymm dst,Ymm x,Operand y)2067 void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); } vminps(Ymm dst,Ymm x,Operand y)2068 void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); } vmaxps(Ymm dst,Ymm x,Operand y)2069 void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); } 2070 vfmadd132ps(Ymm dst,Ymm x,Operand y)2071 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); } vfmadd213ps(Ymm dst,Ymm x,Operand y)2072 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); } vfmadd231ps(Ymm dst,Ymm x,Operand y)2073 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); } 2074 vfmsub132ps(Ymm dst,Ymm x,Operand y)2075 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); } vfmsub213ps(Ymm dst,Ymm x,Operand y)2076 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); } vfmsub231ps(Ymm dst,Ymm x,Operand y)2077 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); } 2078 vfnmadd132ps(Ymm dst,Ymm x,Operand y)2079 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); } vfnmadd213ps(Ymm dst,Ymm x,Operand y)2080 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); } vfnmadd231ps(Ymm dst,Ymm x,Operand y)2081 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); } 2082 vpackusdw(Ymm dst,Ymm x,Operand y)2083 void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); } vpackuswb(Ymm dst,Ymm x,Operand y)2084 void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); } 2085 vpunpckldq(Ymm dst,Ymm x,Operand y)2086 void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); } vpunpckhdq(Ymm dst,Ymm x,Operand y)2087 void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); } 2088 vpcmpeqd(Ymm dst,Ymm x,Operand y)2089 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); } vpcmpeqw(Ymm dst,Ymm x,Operand y)2090 void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); } vpcmpgtd(Ymm dst,Ymm x,Operand y)2091 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); } vpcmpgtw(Ymm dst,Ymm x,Operand y)2092 void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); } 2093 2094 imm_byte_after_operand(const Operand & operand,int imm)2095 void Assembler::imm_byte_after_operand(const Operand& operand, int imm) { 2096 // When we've embedded a label displacement in the middle of an instruction, 2097 // we need to tweak it a little so that the resolved displacement starts 2098 // from the end of the instruction and not the end of the displacement. 2099 if (operand.kind == Operand::LABEL && fCode) { 2100 int disp; 2101 memcpy(&disp, fCode+fSize-4, 4); 2102 disp--; 2103 memcpy(fCode+fSize-4, &disp, 4); 2104 } 2105 this->byte(imm); 2106 } 2107 vcmpps(Ymm dst,Ymm x,Operand y,int imm)2108 void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) { 2109 this->op(0,0x0f,0xc2, dst,x,y); 2110 this->imm_byte_after_operand(y, imm); 2111 } 2112 vpblendvb(Ymm dst,Ymm x,Operand y,Ymm z)2113 void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) { 2114 this->op(0x66,0x3a0f,0x4c, dst,x,y); 2115 this->imm_byte_after_operand(y, z << 4); 2116 } 2117 2118 // Shift instructions encode their opcode extension as "dst", dst as x, and x as y. vpslld(Ymm dst,Ymm x,int imm)2119 void Assembler::vpslld(Ymm dst, Ymm x, int imm) { 2120 this->op(0x66,0x0f,0x72,(Ymm)6, dst,x); 2121 this->byte(imm); 2122 } vpsrld(Ymm dst,Ymm x,int imm)2123 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { 2124 this->op(0x66,0x0f,0x72,(Ymm)2, dst,x); 2125 this->byte(imm); 2126 } vpsrad(Ymm dst,Ymm x,int imm)2127 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { 2128 this->op(0x66,0x0f,0x72,(Ymm)4, dst,x); 2129 this->byte(imm); 2130 } vpsllw(Ymm dst,Ymm x,int imm)2131 void Assembler::vpsllw(Ymm dst, Ymm x, int imm) { 2132 this->op(0x66,0x0f,0x71,(Ymm)6, dst,x); 2133 this->byte(imm); 2134 } vpsrlw(Ymm dst,Ymm x,int imm)2135 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { 2136 this->op(0x66,0x0f,0x71,(Ymm)2, dst,x); 2137 this->byte(imm); 2138 } vpsraw(Ymm dst,Ymm x,int imm)2139 void Assembler::vpsraw(Ymm dst, Ymm x, int imm) { 2140 this->op(0x66,0x0f,0x71,(Ymm)4, dst,x); 2141 this->byte(imm); 2142 } 2143 vpermq(Ymm dst,Operand x,int imm)2144 void Assembler::vpermq(Ymm dst, Operand x, int imm) { 2145 // A bit unusual among the instructions we use, this is 64-bit operation, so we set W. 2146 this->op(0x66,0x3a0f,0x00, dst,x,W1); 2147 this->imm_byte_after_operand(x, imm); 2148 } 2149 vperm2f128(Ymm dst,Ymm x,Operand y,int imm)2150 void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) { 2151 this->op(0x66,0x3a0f,0x06, dst,x,y); 2152 this->imm_byte_after_operand(y, imm); 2153 } 2154 vpermps(Ymm dst,Ymm ix,Operand src)2155 void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) { 2156 this->op(0x66,0x380f,0x16, dst,ix,src); 2157 } 2158 vroundps(Ymm dst,Operand x,Rounding imm)2159 void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) { 2160 this->op(0x66,0x3a0f,0x08, dst,x); 2161 this->imm_byte_after_operand(x, imm); 2162 } 2163 vmovdqa(Ymm dst,Operand src)2164 void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); } vmovups(Ymm dst,Operand src)2165 void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Xmm dst,Operand src)2166 void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Operand dst,Ymm src)2167 void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); } vmovups(Operand dst,Xmm src)2168 void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); } 2169 vcvtdq2ps(Ymm dst,Operand x)2170 void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); } vcvttps2dq(Ymm dst,Operand x)2171 void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); } vcvtps2dq(Ymm dst,Operand x)2172 void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); } vsqrtps(Ymm dst,Operand x)2173 void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); } 2174 vcvtps2ph(Operand dst,Ymm x,Rounding imm)2175 void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) { 2176 this->op(0x66,0x3a0f,0x1d, x,dst); 2177 this->imm_byte_after_operand(dst, imm); 2178 } vcvtph2ps(Ymm dst,Operand x)2179 void Assembler::vcvtph2ps(Ymm dst, Operand x) { 2180 this->op(0x66,0x380f,0x13, dst,x); 2181 } 2182 disp19(Label * l)2183 int Assembler::disp19(Label* l) { 2184 SkASSERT(l->kind == Label::NotYetSet || 2185 l->kind == Label::ARMDisp19); 2186 int here = (int)this->size(); 2187 l->kind = Label::ARMDisp19; 2188 l->references.push_back(here); 2189 // ARM 19-bit instruction count, from the beginning of this instruction. 2190 return (l->offset - here) / 4; 2191 } 2192 disp32(Label * l)2193 int Assembler::disp32(Label* l) { 2194 SkASSERT(l->kind == Label::NotYetSet || 2195 l->kind == Label::X86Disp32); 2196 int here = (int)this->size(); 2197 l->kind = Label::X86Disp32; 2198 l->references.push_back(here); 2199 // x86 32-bit byte count, from the end of this instruction. 2200 return l->offset - (here + 4); 2201 } 2202 op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)2203 void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) { 2204 switch (y.kind) { 2205 case Operand::REG: { 2206 VEX v = vex(w, dst>>3, 0, y.reg>>3, 2207 map, x, l, prefix); 2208 this->bytes(v.bytes, v.len); 2209 this->byte(opcode); 2210 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7)); 2211 } return; 2212 2213 case Operand::MEM: { 2214 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows; 2215 // without an SIB byte, that's where the base register would usually go. 2216 // This means we have to use an SIB byte if we want to use rsp as a base register. 2217 const Mem& m = y.mem; 2218 const bool need_SIB = m.base == rsp 2219 || m.index != rsp; 2220 2221 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3, 2222 map, x, l, prefix); 2223 this->bytes(v.bytes, v.len); 2224 this->byte(opcode); 2225 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7)); 2226 if (need_SIB) { 2227 this->byte(sib(m.scale, m.index&7, m.base&7)); 2228 } 2229 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 2230 } return; 2231 2232 case Operand::LABEL: { 2233 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. 2234 const int rip = rbp; 2235 2236 VEX v = vex(w, dst>>3, 0, rip>>3, 2237 map, x, l, prefix); 2238 this->bytes(v.bytes, v.len); 2239 this->byte(opcode); 2240 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); 2241 this->word(this->disp32(y.label)); 2242 } return; 2243 } 2244 } 2245 vpshufb(Ymm dst,Ymm x,Operand y)2246 void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); } 2247 vptest(Ymm x,Operand y)2248 void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); } 2249 vbroadcastss(Ymm dst,Operand y)2250 void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); } 2251 jump(uint8_t condition,Label * l)2252 void Assembler::jump(uint8_t condition, Label* l) { 2253 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): 2254 // 7? one-byte-disp 2255 // 0F 8? four-byte-disp 2256 // We always use the near displacement to make updating labels simpler (no resizing). 2257 this->byte(0x0f); 2258 this->byte(condition); 2259 this->word(this->disp32(l)); 2260 } je(Label * l)2261 void Assembler::je (Label* l) { this->jump(0x84, l); } jne(Label * l)2262 void Assembler::jne(Label* l) { this->jump(0x85, l); } jl(Label * l)2263 void Assembler::jl (Label* l) { this->jump(0x8c, l); } jc(Label * l)2264 void Assembler::jc (Label* l) { this->jump(0x82, l); } 2265 jmp(Label * l)2266 void Assembler::jmp(Label* l) { 2267 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit. 2268 this->byte(0xe9); 2269 this->word(this->disp32(l)); 2270 } 2271 vpmovzxwd(Ymm dst,Operand src)2272 void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); } vpmovzxbd(Ymm dst,Operand src)2273 void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); } 2274 vmovq(Operand dst,Xmm src)2275 void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); } 2276 vmovd(Operand dst,Xmm src)2277 void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); } vmovd(Xmm dst,Operand src)2278 void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); } 2279 vpinsrd(Xmm dst,Xmm src,Operand y,int imm)2280 void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) { 2281 this->op(0x66,0x3a0f,0x22, dst,src,y); 2282 this->imm_byte_after_operand(y, imm); 2283 } vpinsrw(Xmm dst,Xmm src,Operand y,int imm)2284 void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) { 2285 this->op(0x66,0x0f,0xc4, dst,src,y); 2286 this->imm_byte_after_operand(y, imm); 2287 } vpinsrb(Xmm dst,Xmm src,Operand y,int imm)2288 void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) { 2289 this->op(0x66,0x3a0f,0x20, dst,src,y); 2290 this->imm_byte_after_operand(y, imm); 2291 } 2292 vextracti128(Operand dst,Ymm src,int imm)2293 void Assembler::vextracti128(Operand dst, Ymm src, int imm) { 2294 this->op(0x66,0x3a0f,0x39, src,dst); 2295 SkASSERT(dst.kind != Operand::LABEL); 2296 this->byte(imm); 2297 } vpextrd(Operand dst,Xmm src,int imm)2298 void Assembler::vpextrd(Operand dst, Xmm src, int imm) { 2299 this->op(0x66,0x3a0f,0x16, src,dst); 2300 SkASSERT(dst.kind != Operand::LABEL); 2301 this->byte(imm); 2302 } vpextrw(Operand dst,Xmm src,int imm)2303 void Assembler::vpextrw(Operand dst, Xmm src, int imm) { 2304 this->op(0x66,0x3a0f,0x15, src,dst); 2305 SkASSERT(dst.kind != Operand::LABEL); 2306 this->byte(imm); 2307 } vpextrb(Operand dst,Xmm src,int imm)2308 void Assembler::vpextrb(Operand dst, Xmm src, int imm) { 2309 this->op(0x66,0x3a0f,0x14, src,dst); 2310 SkASSERT(dst.kind != Operand::LABEL); 2311 this->byte(imm); 2312 } 2313 vgatherdps(Ymm dst,Scale scale,Ymm ix,GP64 base,Ymm mask)2314 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { 2315 // Unlike most instructions, no aliasing is permitted here. 2316 SkASSERT(dst != ix); 2317 SkASSERT(dst != mask); 2318 SkASSERT(mask != ix); 2319 2320 int prefix = 0x66, 2321 map = 0x380f, 2322 opcode = 0x92; 2323 VEX v = vex(0, dst>>3, ix>>3, base>>3, 2324 map, mask, /*ymm?*/1, prefix); 2325 this->bytes(v.bytes, v.len); 2326 this->byte(opcode); 2327 this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); 2328 this->byte(sib(scale, ix&7, base&7)); 2329 } 2330 2331 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf 2332 mask(unsigned long long bits)2333 static int mask(unsigned long long bits) { return (1<<(int)bits)-1; } 2334 op(uint32_t hi,V m,uint32_t lo,V n,V d)2335 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) { 2336 this->word( (hi & mask(11)) << 21 2337 | (m & mask(5)) << 16 2338 | (lo & mask(6)) << 10 2339 | (n & mask(5)) << 5 2340 | (d & mask(5)) << 0); 2341 } op(uint32_t op22,V n,V d,int imm)2342 void Assembler::op(uint32_t op22, V n, V d, int imm) { 2343 this->word( (op22 & mask(22)) << 10 2344 | imm // size and location depends on the instruction 2345 | (n & mask(5)) << 5 2346 | (d & mask(5)) << 0); 2347 } 2348 and16b(V d,V n,V m)2349 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); } orr16b(V d,V n,V m)2350 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); } eor16b(V d,V n,V m)2351 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); } bic16b(V d,V n,V m)2352 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); } bsl16b(V d,V n,V m)2353 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); } not16b(V d,V n)2354 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); } 2355 add4s(V d,V n,V m)2356 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); } sub4s(V d,V n,V m)2357 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); } mul4s(V d,V n,V m)2358 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); } 2359 cmeq4s(V d,V n,V m)2360 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); } cmgt4s(V d,V n,V m)2361 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); } 2362 sub8h(V d,V n,V m)2363 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); } mul8h(V d,V n,V m)2364 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); } 2365 fadd4s(V d,V n,V m)2366 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); } fsub4s(V d,V n,V m)2367 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); } fmul4s(V d,V n,V m)2368 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); } fdiv4s(V d,V n,V m)2369 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); } fmin4s(V d,V n,V m)2370 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); } fmax4s(V d,V n,V m)2371 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); } 2372 fneg4s(V d,V n)2373 void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); } fsqrt4s(V d,V n)2374 void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); } 2375 fcmeq4s(V d,V n,V m)2376 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); } fcmgt4s(V d,V n,V m)2377 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); } fcmge4s(V d,V n,V m)2378 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); } 2379 fmla4s(V d,V n,V m)2380 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); } fmls4s(V d,V n,V m)2381 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); } 2382 tbl(V d,V n,V m)2383 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); } 2384 uzp14s(V d,V n,V m)2385 void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); } uzp24s(V d,V n,V m)2386 void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); } zip14s(V d,V n,V m)2387 void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); } zip24s(V d,V n,V m)2388 void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); } 2389 sli4s(V d,V n,int imm5)2390 void Assembler::sli4s(V d, V n, int imm5) { 2391 this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & mask(5))<<16); 2392 } shl4s(V d,V n,int imm5)2393 void Assembler::shl4s(V d, V n, int imm5) { 2394 this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & mask(5))<<16); 2395 } sshr4s(V d,V n,int imm5)2396 void Assembler::sshr4s(V d, V n, int imm5) { 2397 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & mask(5))<<16); 2398 } ushr4s(V d,V n,int imm5)2399 void Assembler::ushr4s(V d, V n, int imm5) { 2400 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & mask(5))<<16); 2401 } ushr8h(V d,V n,int imm4)2402 void Assembler::ushr8h(V d, V n, int imm4) { 2403 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & mask(4))<<16); 2404 } 2405 scvtf4s(V d,V n)2406 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); } fcvtzs4s(V d,V n)2407 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } fcvtns4s(V d,V n)2408 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } frintp4s(V d,V n)2409 void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); } frintm4s(V d,V n)2410 void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); } 2411 fcvtn(V d,V n)2412 void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); } fcvtl(V d,V n)2413 void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); } 2414 xtns2h(V d,V n)2415 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); } xtnh2b(V d,V n)2416 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); } 2417 uxtlb2h(V d,V n)2418 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); } uxtlh2s(V d,V n)2419 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); } 2420 uminv4s(V d,V n)2421 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); } 2422 brk(int imm16)2423 void Assembler::brk(int imm16) { 2424 this->op(0b11010100'001'00000000000, (imm16 & mask(16)) << 5); 2425 } 2426 ret(X n)2427 void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); } 2428 add(X d,X n,int imm12)2429 void Assembler::add(X d, X n, int imm12) { 2430 this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10); 2431 } sub(X d,X n,int imm12)2432 void Assembler::sub(X d, X n, int imm12) { 2433 this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10); 2434 } subs(X d,X n,int imm12)2435 void Assembler::subs(X d, X n, int imm12) { 2436 this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10); 2437 } 2438 add(X d,X n,X m,Shift shift,int imm6)2439 void Assembler::add(X d, X n, X m, Shift shift, int imm6) { 2440 SkASSERT(shift != ROR); 2441 2442 int imm = (imm6 & mask(6)) << 0 2443 | (m & mask(5)) << 6 2444 | (0 & mask(1)) << 11 2445 | (shift & mask(2)) << 12; 2446 this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10); 2447 } 2448 b(Condition cond,Label * l)2449 void Assembler::b(Condition cond, Label* l) { 2450 const int imm19 = this->disp19(l); 2451 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & mask(19)) << 5); 2452 } cbz(X t,Label * l)2453 void Assembler::cbz(X t, Label* l) { 2454 const int imm19 = this->disp19(l); 2455 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & mask(19)) << 5); 2456 } cbnz(X t,Label * l)2457 void Assembler::cbnz(X t, Label* l) { 2458 const int imm19 = this->disp19(l); 2459 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & mask(19)) << 5); 2460 } 2461 ldrd(X dst,X src,int imm12)2462 void Assembler::ldrd(X dst, X src, int imm12) { 2463 this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2464 } ldrs(X dst,X src,int imm12)2465 void Assembler::ldrs(X dst, X src, int imm12) { 2466 this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2467 } ldrh(X dst,X src,int imm12)2468 void Assembler::ldrh(X dst, X src, int imm12) { 2469 this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2470 } ldrb(X dst,X src,int imm12)2471 void Assembler::ldrb(X dst, X src, int imm12) { 2472 this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2473 } 2474 ldrq(V dst,X src,int imm12)2475 void Assembler::ldrq(V dst, X src, int imm12) { 2476 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & mask(12)) << 10); 2477 } ldrd(V dst,X src,int imm12)2478 void Assembler::ldrd(V dst, X src, int imm12) { 2479 this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2480 } ldrs(V dst,X src,int imm12)2481 void Assembler::ldrs(V dst, X src, int imm12) { 2482 this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2483 } ldrh(V dst,X src,int imm12)2484 void Assembler::ldrh(V dst, X src, int imm12) { 2485 this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2486 } ldrb(V dst,X src,int imm12)2487 void Assembler::ldrb(V dst, X src, int imm12) { 2488 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10); 2489 } 2490 strs(X src,X dst,int imm12)2491 void Assembler::strs(X src, X dst, int imm12) { 2492 this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10); 2493 } 2494 strq(V src,X dst,int imm12)2495 void Assembler::strq(V src, X dst, int imm12) { 2496 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & mask(12)) << 10); 2497 } strd(V src,X dst,int imm12)2498 void Assembler::strd(V src, X dst, int imm12) { 2499 this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10); 2500 } strs(V src,X dst,int imm12)2501 void Assembler::strs(V src, X dst, int imm12) { 2502 this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10); 2503 } strh(V src,X dst,int imm12)2504 void Assembler::strh(V src, X dst, int imm12) { 2505 this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10); 2506 } strb(V src,X dst,int imm12)2507 void Assembler::strb(V src, X dst, int imm12) { 2508 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10); 2509 } 2510 movs(X dst,V src,int lane)2511 void Assembler::movs(X dst, V src, int lane) { 2512 int imm5 = (lane << 3) | 0b100; 2513 this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & mask(5)) << 16); 2514 } inss(V dst,X src,int lane)2515 void Assembler::inss(V dst, X src, int lane) { 2516 int imm5 = (lane << 3) | 0b100; 2517 this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & mask(5)) << 16); 2518 } 2519 2520 ldrq(V dst,Label * l)2521 void Assembler::ldrq(V dst, Label* l) { 2522 const int imm19 = this->disp19(l); 2523 this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & mask(19)) << 5); 2524 } 2525 dup4s(V dst,X src)2526 void Assembler::dup4s(V dst, X src) { 2527 this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst); 2528 } 2529 ld1r4s(V dst,X src)2530 void Assembler::ld1r4s(V dst, X src) { 2531 this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst); 2532 } ld1r8h(V dst,X src)2533 void Assembler::ld1r8h(V dst, X src) { 2534 this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst); 2535 } ld1r16b(V dst,X src)2536 void Assembler::ld1r16b(V dst, X src) { 2537 this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst); 2538 } 2539 ld24s(V dst,X src)2540 void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); } ld44s(V dst,X src)2541 void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); } st24s(V src,X dst)2542 void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); } st44s(V src,X dst)2543 void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); } 2544 ld24s(V dst,X src,int lane)2545 void Assembler::ld24s(V dst, X src, int lane) { 2546 int Q = (lane & 2)>>1, 2547 S = (lane & 1); 2548 /* Q S */ 2549 this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12)); 2550 } ld44s(V dst,X src,int lane)2551 void Assembler::ld44s(V dst, X src, int lane) { 2552 int Q = (lane & 2)>>1, 2553 S = (lane & 1); 2554 this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12)); 2555 } 2556 label(Label * l)2557 void Assembler::label(Label* l) { 2558 if (fCode) { 2559 // The instructions all currently point to l->offset. 2560 // We'll want to add a delta to point them to here. 2561 int here = (int)this->size(); 2562 int delta = here - l->offset; 2563 l->offset = here; 2564 2565 if (l->kind == Label::ARMDisp19) { 2566 for (int ref : l->references) { 2567 // ref points to a 32-bit instruction with 19-bit displacement in instructions. 2568 uint32_t inst; 2569 memcpy(&inst, fCode + ref, 4); 2570 2571 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] 2572 int disp = (int)(inst << 8) >> 13; 2573 2574 disp += delta/4; // delta is in bytes, we want instructions. 2575 2576 // Put it all back together, preserving the high 8 bits and low 5. 2577 inst = ((disp << 5) & (mask(19) << 5)) 2578 | ((inst ) & ~(mask(19) << 5)); 2579 memcpy(fCode + ref, &inst, 4); 2580 } 2581 } 2582 2583 if (l->kind == Label::X86Disp32) { 2584 for (int ref : l->references) { 2585 // ref points to a 32-bit displacement in bytes. 2586 int disp; 2587 memcpy(&disp, fCode + ref, 4); 2588 2589 disp += delta; 2590 2591 memcpy(fCode + ref, &disp, 4); 2592 } 2593 } 2594 } 2595 } 2596 eval(int n,void * args[]) const2597 void Program::eval(int n, void* args[]) const { 2598 #define SKVM_JIT_STATS 0 2599 #if SKVM_JIT_STATS 2600 static std::atomic<int64_t> calls{0}, jits{0}, 2601 pixels{0}, fast{0}; 2602 pixels += n; 2603 if (0 == calls++) { 2604 atexit([]{ 2605 int64_t num = jits .load(), 2606 den = calls.load(); 2607 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den); 2608 num = fast .load(); 2609 den = pixels.load(); 2610 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den); 2611 }); 2612 } 2613 #endif 2614 2615 #if !defined(SKVM_JIT_BUT_IGNORE_IT) 2616 const void* jit_entry = fImpl->jit_entry.load(); 2617 // jit_entry may be null if we can't JIT 2618 // 2619 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it 2620 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off, 2621 // due to timing or program caching. 2622 if (jit_entry != nullptr && gSkVMAllowJIT) { 2623 #if SKVM_JIT_STATS 2624 jits++; 2625 fast += n; 2626 #endif 2627 void** a = args; 2628 switch (fImpl->strides.size()) { 2629 case 0: return ((void(*)(int ))jit_entry)(n ); 2630 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] ); 2631 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] ); 2632 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]); 2633 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry) 2634 (n,a[0],a[1],a[2],a[3]); 2635 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry) 2636 (n,a[0],a[1],a[2],a[3],a[4]); 2637 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry) 2638 (n,a[0],a[1],a[2],a[3],a[4],a[5]); 2639 case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry) 2640 (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]); 2641 default: break; //SkASSERT(fImpl->strides.size() <= 7); 2642 } 2643 } 2644 #endif 2645 2646 // So we'll sometimes use the interpreter here even if later calls will use the JIT. 2647 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), 2648 this->nregs(), this->loop(), fImpl->strides.data(), 2649 fImpl->traceHooks.data(), fImpl->traceHooks.size(), 2650 this->nargs(), n, args); 2651 } 2652 hasTraceHooks() const2653 bool Program::hasTraceHooks() const { 2654 // Identifies a program which has been instrumented for debugging. 2655 return !fImpl->traceHooks.empty(); 2656 } 2657 hasJIT() const2658 bool Program::hasJIT() const { 2659 return fImpl->jit_entry.load() != nullptr; 2660 } 2661 dropJIT()2662 void Program::dropJIT() { 2663 #if defined(SKVM_JIT) 2664 if (fImpl->dylib) { 2665 close_dylib(fImpl->dylib); 2666 } else if (auto jit_entry = fImpl->jit_entry.load()) { 2667 unmap_jit_buffer(jit_entry, fImpl->jit_size); 2668 } 2669 #else 2670 SkASSERT(!this->hasJIT()); 2671 #endif 2672 2673 fImpl->jit_entry.store(nullptr); 2674 fImpl->jit_size = 0; 2675 fImpl->dylib = nullptr; 2676 } 2677 Program()2678 Program::Program() : fImpl(std::make_unique<Impl>()) {} 2679 ~Program()2680 Program::~Program() { 2681 // Moved-from Programs may have fImpl == nullptr. 2682 if (fImpl) { 2683 this->dropJIT(); 2684 } 2685 } 2686 Program(Program && other)2687 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} 2688 operator =(Program && other)2689 Program& Program::operator=(Program&& other) { 2690 fImpl = std::move(other.fImpl); 2691 return *this; 2692 } 2693 Program(const std::vector<OptimizedInstruction> & instructions,std::unique_ptr<viz::Visualizer> visualizer,const std::vector<int> & strides,const std::vector<TraceHook * > & traceHooks,const char * debug_name,bool allow_jit)2694 Program::Program(const std::vector<OptimizedInstruction>& instructions, 2695 std::unique_ptr<viz::Visualizer> visualizer, 2696 const std::vector<int>& strides, 2697 const std::vector<TraceHook*>& traceHooks, 2698 const char* debug_name, bool allow_jit) : Program() { 2699 fImpl->visualizer = std::move(visualizer); 2700 fImpl->strides = strides; 2701 fImpl->traceHooks = traceHooks; 2702 if (gSkVMAllowJIT && allow_jit) { 2703 #if defined(SKVM_JIT) 2704 this->setupJIT(instructions, debug_name); 2705 #endif 2706 } 2707 2708 this->setupInterpreter(instructions); 2709 } 2710 instructions() const2711 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } nargs() const2712 int Program::nargs() const { return (int)fImpl->strides.size(); } nregs() const2713 int Program::nregs() const { return fImpl->regs; } loop() const2714 int Program::loop () const { return fImpl->loop; } empty() const2715 bool Program::empty() const { return fImpl->instructions.empty(); } 2716 2717 // Translate OptimizedInstructions to InterpreterInstructions. setupInterpreter(const std::vector<OptimizedInstruction> & instructions)2718 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { 2719 // Register each instruction is assigned to. 2720 std::vector<Reg> reg(instructions.size()); 2721 2722 // This next bit is a bit more complicated than strictly necessary; 2723 // we could just assign every instruction to its own register. 2724 // 2725 // But recycling registers is fairly cheap, and good practice for the 2726 // JITs where minimizing register pressure really is important. 2727 // 2728 // We have effectively infinite registers, so we hoist any value we can. 2729 // (The JIT may choose a more complex policy to reduce register pressure.) 2730 2731 fImpl->regs = 0; 2732 std::vector<Reg> avail; 2733 2734 // Assign this value to a register, recycling them where we can. 2735 auto assign_register = [&](Val id) { 2736 const OptimizedInstruction& inst = instructions[id]; 2737 2738 // If this is a real input and it's lifetime ends at this instruction, 2739 // we can recycle the register it's occupying. 2740 auto maybe_recycle_register = [&](Val input) { 2741 if (input != NA && instructions[input].death == id) { 2742 avail.push_back(reg[input]); 2743 } 2744 }; 2745 2746 // Take care to not recycle the same register twice. 2747 const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w; 2748 if (true ) { maybe_recycle_register(x); } 2749 if (y != x ) { maybe_recycle_register(y); } 2750 if (z != x && z != y ) { maybe_recycle_register(z); } 2751 if (w != x && w != y && w != z) { maybe_recycle_register(w); } 2752 2753 // Instructions that die at themselves (stores) don't need a register. 2754 if (inst.death != id) { 2755 // Allocate a register if we have to, preferring to reuse anything available. 2756 if (avail.empty()) { 2757 reg[id] = fImpl->regs++; 2758 } else { 2759 reg[id] = avail.back(); 2760 avail.pop_back(); 2761 } 2762 } 2763 }; 2764 2765 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. 2766 for (Val id = 0; id < (Val)instructions.size(); id++) { 2767 if ( instructions[id].can_hoist) { assign_register(id); } 2768 } 2769 for (Val id = 0; id < (Val)instructions.size(); id++) { 2770 if (!instructions[id].can_hoist) { assign_register(id); } 2771 } 2772 2773 // Translate OptimizedInstructions to InterpreterIstructions by mapping values to 2774 // registers. This will be two passes, first hoisted instructions, then inside the loop. 2775 2776 // The loop begins at the fImpl->loop'th Instruction. 2777 fImpl->loop = 0; 2778 fImpl->instructions.reserve(instructions.size()); 2779 2780 // Add a mapping for the N/A sentinel Val to any arbitrary register 2781 // so lookups don't have to know which arguments are used by which Ops. 2782 auto lookup_register = [&](Val id) { 2783 return id == NA ? (Reg)0 2784 : reg[id]; 2785 }; 2786 2787 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { 2788 InterpreterInstruction pinst{ 2789 inst.op, 2790 lookup_register(id), 2791 lookup_register(inst.x), 2792 lookup_register(inst.y), 2793 lookup_register(inst.z), 2794 lookup_register(inst.w), 2795 inst.immA, 2796 inst.immB, 2797 inst.immC, 2798 }; 2799 fImpl->instructions.push_back(pinst); 2800 }; 2801 2802 for (Val id = 0; id < (Val)instructions.size(); id++) { 2803 const OptimizedInstruction& inst = instructions[id]; 2804 if (inst.can_hoist) { 2805 push_instruction(id, inst); 2806 fImpl->loop++; 2807 } 2808 } 2809 for (Val id = 0; id < (Val)instructions.size(); id++) { 2810 const OptimizedInstruction& inst = instructions[id]; 2811 if (!inst.can_hoist) { 2812 push_instruction(id, inst); 2813 } 2814 } 2815 } 2816 2817 #if defined(SKVM_JIT) 2818 2819 namespace SkVMJitTypes { 2820 #if defined(__x86_64__) || defined(_M_X64) 2821 using Reg = Assembler::Ymm; 2822 #elif defined(__aarch64__) 2823 using Reg = Assembler::V; 2824 #endif 2825 } // namespace SkVMJitTypes 2826 jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const2827 bool Program::jit(const std::vector<OptimizedInstruction>& instructions, 2828 int* stack_hint, 2829 uint32_t* registers_used, 2830 Assembler* a) const { 2831 using A = Assembler; 2832 using SkVMJitTypes::Reg; 2833 2834 SkTHashMap<int, A::Label> constants; // Constants (mostly splats) share the same pool. 2835 A::Label iota; // Varies per lane, for Op::index. 2836 A::Label load64_index; // Used to load low or high half of 64-bit lanes. 2837 2838 // The `regs` array tracks everything we know about each register's state: 2839 // - NA: empty 2840 // - RES: reserved by ABI 2841 // - TMP: holding a temporary 2842 // - id: holding Val id 2843 constexpr Val RES = NA-1, 2844 TMP = RES-1; 2845 2846 // Map val -> stack slot. 2847 std::vector<int> stack_slot(instructions.size(), NA); 2848 int next_stack_slot = 0; 2849 2850 const int nstack_slots = *stack_hint >= 0 ? *stack_hint 2851 : stack_slot.size(); 2852 #if defined(__x86_64__) || defined(_M_X64) 2853 if (!SkCpu::Supports(SkCpu::HSW)) { 2854 return false; 2855 } 2856 const int K = 8; 2857 #if defined(_M_X64) // Important to check this first; clang-cl defines both. 2858 const A::GP64 N = A::rcx, 2859 GP0 = A::rax, 2860 GP1 = A::r11, 2861 arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi }; 2862 2863 // xmm6-15 need are callee-saved. 2864 std::array<Val,16> regs = { 2865 NA, NA, NA, NA, NA, NA,RES,RES, 2866 RES,RES,RES,RES, RES,RES,RES,RES, 2867 }; 2868 const uint32_t incoming_registers_used = *registers_used; 2869 2870 auto enter = [&]{ 2871 // rcx,rdx,r8,r9 are all already holding their correct values. 2872 // Load caller-saved r10 from rsp+40 if there's a fourth arg. 2873 if (fImpl->strides.size() >= 4) { 2874 a->mov(A::r10, A::Mem{A::rsp, 40}); 2875 } 2876 // Load callee-saved rdi from rsp+48 if there's a fifth arg, 2877 // first saving it to ABI reserved shadow area rsp+8. 2878 if (fImpl->strides.size() >= 5) { 2879 a->mov(A::Mem{A::rsp, 8}, A::rdi); 2880 a->mov(A::rdi, A::Mem{A::rsp, 48}); 2881 } 2882 // Load callee-saved rsi from rsp+56 if there's a sixth arg, 2883 // first saving it to ABI reserved shadow area rsp+16. 2884 if (fImpl->strides.size() >= 6) { 2885 a->mov(A::Mem{A::rsp, 16}, A::rsi); 2886 a->mov(A::rsi, A::Mem{A::rsp, 56}); 2887 } 2888 2889 // Allocate stack for our values and callee-saved xmm6-15. 2890 int stack_needed = nstack_slots*K*4; 2891 for (int r = 6; r < 16; r++) { 2892 if (incoming_registers_used & (1<<r)) { 2893 stack_needed += 16; 2894 } 2895 } 2896 if (stack_needed) { a->sub(A::rsp, stack_needed); } 2897 2898 int next_saved_xmm = nstack_slots*K*4; 2899 for (int r = 6; r < 16; r++) { 2900 if (incoming_registers_used & (1<<r)) { 2901 a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r); 2902 next_saved_xmm += 16; 2903 regs[r] = NA; 2904 } 2905 } 2906 }; 2907 auto exit = [&]{ 2908 // The second pass of jit() shouldn't use any register it didn't in the first pass. 2909 SkASSERT((*registers_used & incoming_registers_used) == *registers_used); 2910 2911 // Restore callee-saved xmm6-15 and the stack pointer. 2912 int stack_used = nstack_slots*K*4; 2913 for (int r = 6; r < 16; r++) { 2914 if (incoming_registers_used & (1<<r)) { 2915 a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used}); 2916 stack_used += 16; 2917 } 2918 } 2919 if (stack_used) { a->add(A::rsp, stack_used); } 2920 2921 // Restore callee-saved rdi/rsi if we used them. 2922 if (fImpl->strides.size() >= 5) { 2923 a->mov(A::rdi, A::Mem{A::rsp, 8}); 2924 } 2925 if (fImpl->strides.size() >= 6) { 2926 a->mov(A::rsi, A::Mem{A::rsp, 16}); 2927 } 2928 2929 a->vzeroupper(); 2930 a->ret(); 2931 }; 2932 #elif defined(__x86_64__) 2933 const A::GP64 N = A::rdi, 2934 GP0 = A::rax, 2935 GP1 = A::r11, 2936 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 }; 2937 2938 // All 16 ymm registers are available to use. 2939 std::array<Val,16> regs = { 2940 NA,NA,NA,NA, NA,NA,NA,NA, 2941 NA,NA,NA,NA, NA,NA,NA,NA, 2942 }; 2943 2944 auto enter = [&]{ 2945 // Load caller-saved r10 from rsp+8 if there's a sixth arg. 2946 if (fImpl->strides.size() >= 6) { 2947 a->mov(A::r10, A::Mem{A::rsp, 8}); 2948 } 2949 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } 2950 }; 2951 auto exit = [&]{ 2952 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); } 2953 a->vzeroupper(); 2954 a->ret(); 2955 }; 2956 #endif 2957 2958 auto load_from_memory = [&](Reg r, Val v) { 2959 if (instructions[v].op == Op::splat) { 2960 if (instructions[v].immA == 0) { 2961 a->vpxor(r,r,r); 2962 } else { 2963 a->vmovups(r, constants.find(instructions[v].immA)); 2964 } 2965 } else { 2966 SkASSERT(stack_slot[v] != NA); 2967 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4}); 2968 } 2969 }; 2970 auto store_to_stack = [&](Reg r, Val v) { 2971 SkASSERT(next_stack_slot < nstack_slots); 2972 stack_slot[v] = next_stack_slot++; 2973 a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r); 2974 }; 2975 #elif defined(__aarch64__) 2976 const int K = 4; 2977 const A::X N = A::x0, 2978 GP0 = A::x8, 2979 GP1 = A::x9, 2980 arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; 2981 2982 // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit. 2983 std::array<Val,32> regs = { 2984 NA, NA, NA, NA, NA, NA, NA, NA, 2985 RES,RES,RES,RES, RES,RES,RES,RES, 2986 NA, NA, NA, NA, NA, NA, NA, NA, 2987 NA, NA, NA, NA, NA, NA, NA, NA, 2988 }; 2989 2990 auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } }; 2991 auto exit = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); } 2992 a->ret(A::x30); }; 2993 2994 auto load_from_memory = [&](Reg r, Val v) { 2995 if (instructions[v].op == Op::splat) { 2996 if (instructions[v].immA == 0) { 2997 a->eor16b(r,r,r); 2998 } else { 2999 a->ldrq(r, constants.find(instructions[v].immA)); 3000 } 3001 } else { 3002 SkASSERT(stack_slot[v] != NA); 3003 a->ldrq(r, A::sp, stack_slot[v]); 3004 } 3005 }; 3006 auto store_to_stack = [&](Reg r, Val v) { 3007 SkASSERT(next_stack_slot < nstack_slots); 3008 stack_slot[v] = next_stack_slot++; 3009 a->strq(r, A::sp, stack_slot[v]); 3010 }; 3011 #endif 3012 3013 *registers_used = 0; // We'll update this as we go. 3014 3015 if (std::size(arg) < fImpl->strides.size()) { 3016 return false; 3017 } 3018 3019 auto emit = [&](Val id, bool scalar) { 3020 const int active_lanes = scalar ? 1 : K; 3021 const OptimizedInstruction& inst = instructions[id]; 3022 const Op op = inst.op; 3023 const Val x = inst.x, 3024 y = inst.y, 3025 z = inst.z, 3026 w = inst.w; 3027 const int immA = inst.immA, 3028 immB = inst.immB, 3029 immC = inst.immC; 3030 3031 // alloc_tmp() returns the first of N adjacent temporary registers, 3032 // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst(). 3033 auto alloc_tmp = [&](int N=1) -> Reg { 3034 auto needs_spill = [&](Val v) -> bool { 3035 SkASSERT(v >= 0); // {NA,TMP,RES} need to be handled before calling this. 3036 return stack_slot[v] == NA // We haven't spilled it already? 3037 && instructions[v].op != Op::splat; // No need to spill constants. 3038 }; 3039 3040 // We want to find a block of N adjacent registers requiring the fewest spills. 3041 int best_block = -1, 3042 min_spills = 0x7fff'ffff; 3043 for (int block = 0; block+N <= (int)regs.size(); block++) { 3044 int spills = 0; 3045 for (int r = block; r < block+N; r++) { 3046 Val v = regs[r]; 3047 // Registers holding NA (nothing) are ideal, nothing to spill. 3048 if (v == NA) { 3049 continue; 3050 } 3051 // We can't spill anything REServed or that we'll need this instruction. 3052 if (v == RES || 3053 v == TMP || v == id || v == x || v == y || v == z || v == w) { 3054 spills = 0x7fff'ffff; 3055 block = r; // (optimization) continue outer loop at next register. 3056 break; 3057 } 3058 // Usually here we've got a value v that we'd have to spill to the stack 3059 // before reusing its register, but sometimes even now we get a freebie. 3060 spills += needs_spill(v) ? 1 : 0; 3061 } 3062 3063 // TODO: non-arbitrary tie-breaking? 3064 if (min_spills > spills) { 3065 min_spills = spills; 3066 best_block = block; 3067 } 3068 if (min_spills == 0) { 3069 break; // (optimization) stop early if we find an unbeatable block. 3070 } 3071 } 3072 3073 // TODO: our search's success isn't obviously guaranteed... it depends on N 3074 // and the number and relative position in regs of any unspillable values. 3075 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64; 3076 // we'll need to revisit this logic should this assert fire. 3077 SkASSERT(min_spills <= N); 3078 3079 // Spill what needs spilling, and mark the block all as TMP. 3080 for (int r = best_block; r < best_block+N; r++) { 3081 Val& v = regs[r]; 3082 *registers_used |= (1<<r); 3083 3084 SkASSERT(v == NA || v >= 0); 3085 if (v >= 0 && needs_spill(v)) { 3086 store_to_stack((Reg)r, v); 3087 SkASSERT(!needs_spill(v)); 3088 min_spills--; 3089 } 3090 3091 v = TMP; 3092 } 3093 SkASSERT(min_spills == 0); 3094 return (Reg)best_block; 3095 }; 3096 3097 auto free_tmp = [&](Reg r) { 3098 SkASSERT(regs[r] == TMP); 3099 regs[r] = NA; 3100 }; 3101 3102 // Which register holds dst,x,y,z,w for this instruction? NA if none does yet. 3103 int rd = NA, 3104 rx = NA, 3105 ry = NA, 3106 rz = NA, 3107 rw = NA; 3108 3109 auto update_regs = [&](Reg r, Val v) { 3110 if (v == id) { rd = r; } 3111 if (v == x) { rx = r; } 3112 if (v == y) { ry = r; } 3113 if (v == z) { rz = r; } 3114 if (v == w) { rw = r; } 3115 return r; 3116 }; 3117 3118 auto find_existing_reg = [&](Val v) -> int { 3119 // Quick-check our working registers. 3120 if (v == id && rd != NA) { return rd; } 3121 if (v == x && rx != NA) { return rx; } 3122 if (v == y && ry != NA) { return ry; } 3123 if (v == z && rz != NA) { return rz; } 3124 if (v == w && rw != NA) { return rw; } 3125 3126 // Search inter-instruction register map. 3127 for (auto [r,val] : SkMakeEnumerate(regs)) { 3128 if (val == v) { 3129 return update_regs((Reg)r, v); 3130 } 3131 } 3132 return NA; 3133 }; 3134 3135 // Return a register for Val, holding that value if it already exists. 3136 // During this instruction all calls to r(v) will return the same register. 3137 auto r = [&](Val v) -> Reg { 3138 SkASSERT(v >= 0); 3139 3140 if (int found = find_existing_reg(v); found != NA) { 3141 return (Reg)found; 3142 } 3143 3144 Reg r = alloc_tmp(); 3145 SkASSERT(regs[r] == TMP); 3146 3147 SkASSERT(v <= id); 3148 if (v < id) { 3149 // If v < id, we're loading one of this instruction's inputs. 3150 // If v == id we're just allocating its destination register. 3151 load_from_memory(r, v); 3152 } 3153 regs[r] = v; 3154 return update_regs(r, v); 3155 }; 3156 3157 auto dies_here = [&](Val v) -> bool { 3158 SkASSERT(v >= 0); 3159 return instructions[v].death == id; 3160 }; 3161 3162 // Alias dst() to r(v) if dies_here(v). 3163 auto try_alias = [&](Val v) -> bool { 3164 SkASSERT(v == x || v == y || v == z || v == w); 3165 if (dies_here(v)) { 3166 rd = r(v); // Vals v and id share a register for this instruction. 3167 regs[rd] = id; // Next instruction, Val id will be in the register, not Val v. 3168 return true; 3169 } 3170 return false; 3171 }; 3172 3173 // Generally r(id), 3174 // but with a hint, try to alias dst() to r(v) if dies_here(v). 3175 auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg { 3176 if (hint1 != NA && try_alias(hint1)) { return r(id); } 3177 if (hint2 != NA && try_alias(hint2)) { return r(id); } 3178 return r(id); 3179 }; 3180 3181 #if defined(__aarch64__) // Nothing sneaky, just unused on x86-64. 3182 auto mark_tmp_as_dst = [&](Reg tmp) { 3183 SkASSERT(regs[tmp] == TMP); 3184 rd = tmp; 3185 regs[rd] = id; 3186 SkASSERT(dst() == tmp); 3187 }; 3188 #endif 3189 3190 #if defined(__x86_64__) || defined(_M_X64) 3191 // On x86 we can work with many values directly from the stack or program constant pool. 3192 auto any = [&](Val v) -> A::Operand { 3193 SkASSERT(v >= 0); 3194 SkASSERT(v < id); 3195 3196 if (int found = find_existing_reg(v); found != NA) { 3197 return (Reg)found; 3198 } 3199 if (instructions[v].op == Op::splat) { 3200 return constants.find(instructions[v].immA); 3201 } 3202 return A::Mem{A::rsp, stack_slot[v]*K*4}; 3203 }; 3204 3205 // This is never really worth asking except when any() might be used; 3206 // if we need this value in ARM, might as well just call r(v) to get it into a register. 3207 auto in_reg = [&](Val v) -> bool { 3208 return find_existing_reg(v) != NA; 3209 }; 3210 #endif 3211 3212 switch (op) { 3213 // Make sure splat constants can be found by load_from_memory() or any(). 3214 case Op::splat: 3215 (void)constants[immA]; 3216 break; 3217 3218 #if defined(__x86_64__) || defined(_M_X64) 3219 case Op::assert_true: { 3220 a->vptest (r(x), &constants[0xffffffff]); 3221 A::Label all_true; 3222 a->jc(&all_true); 3223 a->int3(); 3224 a->label(&all_true); 3225 } break; 3226 3227 case Op::trace_line: 3228 case Op::trace_var: 3229 case Op::trace_enter: 3230 case Op::trace_exit: 3231 case Op::trace_scope: 3232 /* Force this program to run in the interpreter. */ 3233 return false; 3234 3235 case Op::store8: 3236 if (scalar) { 3237 a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3238 } else { 3239 a->vpackusdw(dst(x), r(x), r(x)); 3240 a->vpermq (dst(), dst(), 0xd8); 3241 a->vpackuswb(dst(), dst(), dst()); 3242 a->vmovq (A::Mem{arg[immA]}, (A::Xmm)dst()); 3243 } break; 3244 3245 case Op::store16: 3246 if (scalar) { 3247 a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0); 3248 } else { 3249 a->vpackusdw(dst(x), r(x), r(x)); 3250 a->vpermq (dst(), dst(), 0xd8); 3251 a->vmovups (A::Mem{arg[immA]}, (A::Xmm)dst()); 3252 } break; 3253 3254 case Op::store32: if (scalar) { a->vmovd (A::Mem{arg[immA]}, (A::Xmm)r(x)); } 3255 else { a->vmovups(A::Mem{arg[immA]}, r(x)); } 3256 break; 3257 3258 case Op::store64: if (scalar) { 3259 a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x)); 3260 a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y)); 3261 } else { 3262 // r(x) = {a,b,c,d|e,f,g,h} 3263 // r(y) = {i,j,k,l|m,n,o,p} 3264 // We want to write a,i,b,j,c,k,d,l,e,m... 3265 A::Ymm L = alloc_tmp(), 3266 H = alloc_tmp(); 3267 a->vpunpckldq(L, r(x), any(y)); // L = {a,i,b,j|e,m,f,n} 3268 a->vpunpckhdq(H, r(x), any(y)); // H = {c,k,d,l|g,o,h,p} 3269 a->vperm2f128(dst(), L,H, 0x20); // = {a,i,b,j|c,k,d,l} 3270 a->vmovups(A::Mem{arg[immA], 0}, dst()); 3271 a->vperm2f128(dst(), L,H, 0x31); // = {e,m,f,n|g,o,h,p} 3272 a->vmovups(A::Mem{arg[immA],32}, dst()); 3273 free_tmp(L); 3274 free_tmp(H); 3275 } break; 3276 3277 case Op::store128: { 3278 // TODO: >32-bit stores 3279 a->vmovd (A::Mem{arg[immA], 0*16 + 0}, (A::Xmm)r(x) ); 3280 a->vmovd (A::Mem{arg[immA], 0*16 + 4}, (A::Xmm)r(y) ); 3281 a->vmovd (A::Mem{arg[immA], 0*16 + 8}, (A::Xmm)r(z) ); 3282 a->vmovd (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w) ); 3283 if (scalar) { break; } 3284 3285 a->vpextrd(A::Mem{arg[immA], 1*16 + 0}, (A::Xmm)r(x), 1); 3286 a->vpextrd(A::Mem{arg[immA], 1*16 + 4}, (A::Xmm)r(y), 1); 3287 a->vpextrd(A::Mem{arg[immA], 1*16 + 8}, (A::Xmm)r(z), 1); 3288 a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1); 3289 3290 a->vpextrd(A::Mem{arg[immA], 2*16 + 0}, (A::Xmm)r(x), 2); 3291 a->vpextrd(A::Mem{arg[immA], 2*16 + 4}, (A::Xmm)r(y), 2); 3292 a->vpextrd(A::Mem{arg[immA], 2*16 + 8}, (A::Xmm)r(z), 2); 3293 a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2); 3294 3295 a->vpextrd(A::Mem{arg[immA], 3*16 + 0}, (A::Xmm)r(x), 3); 3296 a->vpextrd(A::Mem{arg[immA], 3*16 + 4}, (A::Xmm)r(y), 3); 3297 a->vpextrd(A::Mem{arg[immA], 3*16 + 8}, (A::Xmm)r(z), 3); 3298 a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3); 3299 // Now we need to store the upper 128 bits of x,y,z,w. 3300 // Storing in this order rather than interlacing minimizes temporaries. 3301 a->vextracti128(dst(), r(x), 1); 3302 a->vmovd (A::Mem{arg[immA], 4*16 + 0}, (A::Xmm)dst() ); 3303 a->vpextrd(A::Mem{arg[immA], 5*16 + 0}, (A::Xmm)dst(), 1); 3304 a->vpextrd(A::Mem{arg[immA], 6*16 + 0}, (A::Xmm)dst(), 2); 3305 a->vpextrd(A::Mem{arg[immA], 7*16 + 0}, (A::Xmm)dst(), 3); 3306 3307 a->vextracti128(dst(), r(y), 1); 3308 a->vmovd (A::Mem{arg[immA], 4*16 + 4}, (A::Xmm)dst() ); 3309 a->vpextrd(A::Mem{arg[immA], 5*16 + 4}, (A::Xmm)dst(), 1); 3310 a->vpextrd(A::Mem{arg[immA], 6*16 + 4}, (A::Xmm)dst(), 2); 3311 a->vpextrd(A::Mem{arg[immA], 7*16 + 4}, (A::Xmm)dst(), 3); 3312 3313 a->vextracti128(dst(), r(z), 1); 3314 a->vmovd (A::Mem{arg[immA], 4*16 + 8}, (A::Xmm)dst() ); 3315 a->vpextrd(A::Mem{arg[immA], 5*16 + 8}, (A::Xmm)dst(), 1); 3316 a->vpextrd(A::Mem{arg[immA], 6*16 + 8}, (A::Xmm)dst(), 2); 3317 a->vpextrd(A::Mem{arg[immA], 7*16 + 8}, (A::Xmm)dst(), 3); 3318 3319 a->vextracti128(dst(), r(w), 1); 3320 a->vmovd (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst() ); 3321 a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1); 3322 a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2); 3323 a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3); 3324 } break; 3325 3326 case Op::load8: if (scalar) { 3327 a->vpxor (dst(), dst(), dst()); 3328 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3329 } else { 3330 a->vpmovzxbd(dst(), A::Mem{arg[immA]}); 3331 } break; 3332 3333 case Op::load16: if (scalar) { 3334 a->vpxor (dst(), dst(), dst()); 3335 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0); 3336 } else { 3337 a->vpmovzxwd(dst(), A::Mem{arg[immA]}); 3338 } break; 3339 3340 case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), A::Mem{arg[immA]}); } 3341 else { a->vmovups( dst(), A::Mem{arg[immA]}); } 3342 break; 3343 3344 case Op::load64: if (scalar) { 3345 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3346 } else { 3347 A::Ymm tmp = alloc_tmp(); 3348 a->vmovups(tmp, &load64_index); 3349 a->vpermps(dst(), tmp, A::Mem{arg[immA], 0}); 3350 a->vpermps( tmp, tmp, A::Mem{arg[immA], 32}); 3351 // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1. 3352 a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20); 3353 free_tmp(tmp); 3354 } break; 3355 3356 case Op::load128: if (scalar) { 3357 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB}); 3358 } else { 3359 // Load 4 low values into xmm tmp, 3360 A::Ymm tmp = alloc_tmp(); 3361 A::Xmm t = (A::Xmm)tmp; 3362 a->vmovd (t, A::Mem{arg[immA], 0*16 + 4*immB} ); 3363 a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1); 3364 a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2); 3365 a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3); 3366 3367 // Load 4 high values into xmm dst(), 3368 A::Xmm d = (A::Xmm)dst(); 3369 a->vmovd (d, A::Mem{arg[immA], 4*16 + 4*immB} ); 3370 a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1); 3371 a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2); 3372 a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3); 3373 3374 // Merge the two, ymm dst() = {xmm tmp|xmm dst()} 3375 a->vperm2f128(dst(), tmp,dst(), 0x20); 3376 free_tmp(tmp); 3377 } break; 3378 3379 case Op::gather8: { 3380 // As usual, the gather base pointer is immB bytes off of uniform immA. 3381 a->mov(GP0, A::Mem{arg[immA], immB}); 3382 3383 A::Ymm tmp = alloc_tmp(); 3384 a->vmovups(tmp, any(x)); 3385 3386 for (int i = 0; i < active_lanes; i++) { 3387 if (i == 4) { 3388 // vpextrd can only pluck indices out from an Xmm register, 3389 // so we manually swap over to the top when we're halfway through. 3390 a->vextracti128((A::Xmm)tmp, tmp, 1); 3391 } 3392 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3393 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i); 3394 } 3395 a->vpmovzxbd(dst(), dst()); 3396 free_tmp(tmp); 3397 } break; 3398 3399 case Op::gather16: { 3400 // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd. 3401 a->mov(GP0, A::Mem{arg[immA], immB}); 3402 3403 A::Ymm tmp = alloc_tmp(); 3404 a->vmovups(tmp, any(x)); 3405 3406 for (int i = 0; i < active_lanes; i++) { 3407 if (i == 4) { 3408 a->vextracti128((A::Xmm)tmp, tmp, 1); 3409 } 3410 a->vpextrd(GP1, (A::Xmm)tmp, i%4); 3411 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i); 3412 } 3413 a->vpmovzxwd(dst(), dst()); 3414 free_tmp(tmp); 3415 } break; 3416 3417 case Op::gather32: 3418 if (scalar) { 3419 // Our gather base pointer is immB bytes off of uniform immA. 3420 a->mov(GP0, A::Mem{arg[immA], immB}); 3421 3422 // Grab our index from lane 0 of the index argument. 3423 a->vmovd(GP1, (A::Xmm)r(x)); 3424 3425 // dst = *(base + 4*index) 3426 a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR}); 3427 } else { 3428 a->mov(GP0, A::Mem{arg[immA], immB}); 3429 3430 A::Ymm mask = alloc_tmp(); 3431 a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) 3432 3433 a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask); 3434 free_tmp(mask); 3435 } 3436 break; 3437 3438 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB}); 3439 break; 3440 3441 case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB}); 3442 a->vbroadcastss(dst(), A::Mem{GP0, immC}); 3443 break; 3444 3445 case Op::index: a->vmovd((A::Xmm)dst(), N); 3446 a->vbroadcastss(dst(), dst()); 3447 a->vpsubd(dst(), dst(), &iota); 3448 break; 3449 3450 // We can swap the arguments of symmetric instructions to make better use of any(). 3451 case Op::add_f32: 3452 if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); } 3453 else { a->vaddps(dst(y), r(y), any(x)); } 3454 break; 3455 3456 case Op::mul_f32: 3457 if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); } 3458 else { a->vmulps(dst(y), r(y), any(x)); } 3459 break; 3460 3461 case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break; 3462 case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break; 3463 case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters, 3464 case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max. 3465 3466 case Op::fma_f32: 3467 if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else 3468 if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else 3469 if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else 3470 { a->vmovups (dst(), any(x)); 3471 a->vfmadd132ps(dst(), r(z), any(y)); } 3472 break; 3473 3474 case Op::fms_f32: 3475 if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else 3476 if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else 3477 if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else 3478 { a->vmovups (dst(), any(x)); 3479 a->vfmsub132ps(dst(), r(z), any(y)); } 3480 break; 3481 3482 case Op::fnma_f32: 3483 if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else 3484 if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else 3485 if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else 3486 { a->vmovups (dst(), any(x)); 3487 a->vfnmadd132ps(dst(), r(z), any(y)); } 3488 break; 3489 3490 // In situations like this we want to try aliasing dst(x) when x is 3491 // already in a register, but not if we'd have to load it from the stack 3492 // just to alias it. That's done better directly into the new register. 3493 case Op::sqrt_f32: 3494 if (in_reg(x)) { a->vsqrtps(dst(x), r(x)); } 3495 else { a->vsqrtps(dst(), any(x)); } 3496 break; 3497 3498 case Op::add_i32: 3499 if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); } 3500 else { a->vpaddd(dst(y), r(y), any(x)); } 3501 break; 3502 3503 case Op::mul_i32: 3504 if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); } 3505 else { a->vpmulld(dst(y), r(y), any(x)); } 3506 break; 3507 3508 case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break; 3509 3510 case Op::bit_and: 3511 if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); } 3512 else { a->vpand(dst(y), r(y), any(x)); } 3513 break; 3514 case Op::bit_or: 3515 if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); } 3516 else { a->vpor(dst(y), r(y), any(x)); } 3517 break; 3518 case Op::bit_xor: 3519 if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); } 3520 else { a->vpxor(dst(y), r(y), any(x)); } 3521 break; 3522 3523 case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x. 3524 3525 case Op::select: 3526 if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); } 3527 else { a->vpblendvb(dst(x), r(z), any(y), r(x)); } 3528 break; 3529 3530 case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break; 3531 case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break; 3532 case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break; 3533 3534 case Op::eq_i32: 3535 if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); } 3536 else { a->vpcmpeqd(dst(y), r(y), any(x)); } 3537 break; 3538 3539 case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break; 3540 3541 case Op::eq_f32: 3542 if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); } 3543 else { a->vcmpeqps(dst(y), r(y), any(x)); } 3544 break; 3545 case Op::neq_f32: 3546 if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); } 3547 else { a->vcmpneqps(dst(y), r(y), any(x)); } 3548 break; 3549 3550 case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break; 3551 case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break; 3552 3553 case Op::ceil: 3554 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); } 3555 else { a->vroundps(dst(), any(x), Assembler::CEIL); } 3556 break; 3557 3558 case Op::floor: 3559 if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::FLOOR); } 3560 else { a->vroundps(dst(), any(x), Assembler::FLOOR); } 3561 break; 3562 3563 case Op::to_f32: 3564 if (in_reg(x)) { a->vcvtdq2ps(dst(x), r(x)); } 3565 else { a->vcvtdq2ps(dst(), any(x)); } 3566 break; 3567 3568 case Op::trunc: 3569 if (in_reg(x)) { a->vcvttps2dq(dst(x), r(x)); } 3570 else { a->vcvttps2dq(dst(), any(x)); } 3571 break; 3572 3573 case Op::round: 3574 if (in_reg(x)) { a->vcvtps2dq(dst(x), r(x)); } 3575 else { a->vcvtps2dq(dst(), any(x)); } 3576 break; 3577 3578 case Op::to_fp16: 3579 a->vcvtps2ph(dst(x), r(x), A::CURRENT); // f32 ymm -> f16 xmm 3580 a->vpmovzxwd(dst(), dst()); // f16 xmm -> f16 ymm 3581 break; 3582 3583 case Op::from_fp16: 3584 a->vpackusdw(dst(x), r(x), r(x)); // f16 ymm -> f16 xmm 3585 a->vpermq (dst(), dst(), 0xd8); // swap middle two 64-bit lanes 3586 a->vcvtph2ps(dst(), dst()); // f16 xmm -> f32 ymm 3587 break; 3588 3589 case Op::duplicate: break; 3590 3591 #elif defined(__aarch64__) 3592 case Op::assert_true: { 3593 a->uminv4s(dst(), r(x)); // uminv acts like an all() across the vector. 3594 a->movs(GP0, dst(), 0); 3595 A::Label all_true; 3596 a->cbnz(GP0, &all_true); 3597 a->brk(0); 3598 a->label(&all_true); 3599 } break; 3600 3601 case Op::trace_line: 3602 case Op::trace_var: 3603 case Op::trace_enter: 3604 case Op::trace_exit: 3605 case Op::trace_scope: 3606 /* Force this program to run in the interpreter. */ 3607 return false; 3608 3609 case Op::index: { 3610 A::V tmp = alloc_tmp(); 3611 a->ldrq (tmp, &iota); 3612 a->dup4s(dst(), N); 3613 a->sub4s(dst(), dst(), tmp); 3614 free_tmp(tmp); 3615 } break; 3616 3617 case Op::store8: a->xtns2h(dst(x), r(x)); 3618 a->xtnh2b(dst(), dst()); 3619 if (scalar) { a->strb (dst(), arg[immA]); } 3620 else { a->strs (dst(), arg[immA]); } 3621 break; 3622 3623 case Op::store16: a->xtns2h(dst(x), r(x)); 3624 if (scalar) { a->strh (dst(), arg[immA]); } 3625 else { a->strd (dst(), arg[immA]); } 3626 break; 3627 3628 case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); } 3629 else { a->strq(r(x), arg[immA]); } 3630 break; 3631 3632 case Op::store64: if (scalar) { 3633 a->strs(r(x), arg[immA], 0); 3634 a->strs(r(y), arg[immA], 1); 3635 } else if (r(y) == r(x)+1) { 3636 a->st24s(r(x), arg[immA]); 3637 } else { 3638 Reg tmp0 = alloc_tmp(2), 3639 tmp1 = (Reg)(tmp0+1); 3640 a->orr16b(tmp0, r(x), r(x)); 3641 a->orr16b(tmp1, r(y), r(y)); 3642 a-> st24s(tmp0, arg[immA]); 3643 free_tmp(tmp0); 3644 free_tmp(tmp1); 3645 } break; 3646 3647 case Op::store128: 3648 if (scalar) { 3649 a->strs(r(x), arg[immA], 0); 3650 a->strs(r(y), arg[immA], 1); 3651 a->strs(r(z), arg[immA], 2); 3652 a->strs(r(w), arg[immA], 3); 3653 } else if (r(y) == r(x)+1 && 3654 r(z) == r(x)+2 && 3655 r(w) == r(x)+3) { 3656 a->st44s(r(x), arg[immA]); 3657 } else { 3658 Reg tmp0 = alloc_tmp(4), 3659 tmp1 = (Reg)(tmp0+1), 3660 tmp2 = (Reg)(tmp0+2), 3661 tmp3 = (Reg)(tmp0+3); 3662 a->orr16b(tmp0, r(x), r(x)); 3663 a->orr16b(tmp1, r(y), r(y)); 3664 a->orr16b(tmp2, r(z), r(z)); 3665 a->orr16b(tmp3, r(w), r(w)); 3666 a-> st44s(tmp0, arg[immA]); 3667 free_tmp(tmp0); 3668 free_tmp(tmp1); 3669 free_tmp(tmp2); 3670 free_tmp(tmp3); 3671 } break; 3672 3673 3674 case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); } 3675 else { a->ldrs(dst(), arg[immA]); } 3676 a->uxtlb2h(dst(), dst()); 3677 a->uxtlh2s(dst(), dst()); 3678 break; 3679 3680 case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); } 3681 else { a->ldrd(dst(), arg[immA]); } 3682 a->uxtlh2s(dst(), dst()); 3683 break; 3684 3685 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); } 3686 else { a->ldrq(dst(), arg[immA]); } 3687 break; 3688 3689 case Op::load64: if (scalar) { 3690 a->ldrs(dst(), arg[immA], immB); 3691 } else { 3692 Reg tmp0 = alloc_tmp(2), 3693 tmp1 = (Reg)(tmp0+1); 3694 a->ld24s(tmp0, arg[immA]); 3695 // TODO: return both 3696 switch (immB) { 3697 case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break; 3698 case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break; 3699 } 3700 } break; 3701 3702 case Op::load128: if (scalar) { 3703 a->ldrs(dst(), arg[immA], immB); 3704 } else { 3705 Reg tmp0 = alloc_tmp(4), 3706 tmp1 = (Reg)(tmp0+1), 3707 tmp2 = (Reg)(tmp0+2), 3708 tmp3 = (Reg)(tmp0+3); 3709 a->ld44s(tmp0, arg[immA]); 3710 // TODO: return all four 3711 switch (immB) { 3712 case 0: mark_tmp_as_dst(tmp0); break; 3713 case 1: mark_tmp_as_dst(tmp1); break; 3714 case 2: mark_tmp_as_dst(tmp2); break; 3715 case 3: mark_tmp_as_dst(tmp3); break; 3716 } 3717 if (immB != 0) { free_tmp(tmp0); } 3718 if (immB != 1) { free_tmp(tmp1); } 3719 if (immB != 2) { free_tmp(tmp2); } 3720 if (immB != 3) { free_tmp(tmp3); } 3721 } break; 3722 3723 case Op::uniform32: a->add(GP0, arg[immA], immB); 3724 a->ld1r4s(dst(), GP0); 3725 break; 3726 3727 case Op::array32: a->add(GP0, arg[immA], immB); 3728 a->ldrd(GP0, GP0); 3729 a->add(GP0, GP0, immC); 3730 a->ld1r4s(dst(), GP0); 3731 break; 3732 3733 case Op::gather8: { 3734 // As usual, the gather base pointer is immB bytes off of uniform immA. 3735 a->add (GP0, arg[immA], immB); // GP0 = &(gather base pointer) 3736 a->ldrd(GP0, GP0); // GP0 = gather base pointer 3737 3738 for (int i = 0; i < active_lanes; i++) { 3739 a->movs(GP1, r(x), i); // Extract index lane i into GP1. 3740 a->add (GP1, GP0, GP1); // Add the gather base pointer. 3741 a->ldrb(GP1, GP1); // Load that byte. 3742 a->inss(dst(x), GP1, i); // Insert it into dst() lane i. 3743 } 3744 } break; 3745 3746 // See gather8 for general idea; comments here only where gather16 differs. 3747 case Op::gather16: { 3748 a->add (GP0, arg[immA], immB); 3749 a->ldrd(GP0, GP0); 3750 for (int i = 0; i < active_lanes; i++) { 3751 a->movs(GP1, r(x), i); 3752 a->add (GP1, GP0, GP1, A::LSL, 1); // Scale index 2x into a byte offset. 3753 a->ldrh(GP1, GP1); // 2-byte load. 3754 a->inss(dst(x), GP1, i); 3755 } 3756 } break; 3757 3758 // See gather8 for general idea; comments here only where gather32 differs. 3759 case Op::gather32: { 3760 a->add (GP0, arg[immA], immB); 3761 a->ldrd(GP0, GP0); 3762 for (int i = 0; i < active_lanes; i++) { 3763 a->movs(GP1, r(x), i); 3764 a->add (GP1, GP0, GP1, A::LSL, 2); // Scale index 4x into a byte offset. 3765 a->ldrs(GP1, GP1); // 4-byte load. 3766 a->inss(dst(x), GP1, i); 3767 } 3768 } break; 3769 3770 case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break; 3771 case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break; 3772 case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break; 3773 case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break; 3774 3775 case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break; 3776 3777 case Op::fma_f32: // fmla.4s is z += x*y 3778 if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); } 3779 else { a->orr16b(dst(), r(z), r(z)); 3780 a->fmla4s(dst(), r(x), r(y)); } 3781 break; 3782 3783 case Op::fnma_f32: // fmls.4s is z -= x*y 3784 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 3785 else { a->orr16b(dst(), r(z), r(z)); 3786 a->fmls4s(dst(), r(x), r(y)); } 3787 break; 3788 3789 case Op::fms_f32: // calculate z - xy, then negate to xy - z 3790 if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } 3791 else { a->orr16b(dst(), r(z), r(z)); 3792 a->fmls4s(dst(), r(x), r(y)); } 3793 a->fneg4s(dst(), dst()); 3794 break; 3795 3796 case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break; 3797 case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break; 3798 case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break; 3799 case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); 3800 a->not16b (dst(), dst()); break; 3801 3802 3803 case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break; 3804 case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break; 3805 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break; 3806 3807 case Op::bit_and : a->and16b(dst(x,y), r(x), r(y)); break; 3808 case Op::bit_or : a->orr16b(dst(x,y), r(x), r(y)); break; 3809 case Op::bit_xor : a->eor16b(dst(x,y), r(x), r(y)); break; 3810 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break; 3811 3812 case Op::select: // bsl16b is x = x ? y : z 3813 if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); } 3814 else { a->orr16b(dst(), r(x), r(x)); 3815 a->bsl16b(dst(), r(y), r(z)); } 3816 break; 3817 3818 // fmin4s and fmax4s don't work the way we want with NaN, 3819 // so we write them the long way: 3820 case Op::min_f32: // min(x,y) = y<x ? y : x 3821 a->fcmgt4s(dst(), r(x), r(y)); 3822 a->bsl16b (dst(), r(y), r(x)); 3823 break; 3824 3825 case Op::max_f32: // max(x,y) = x<y ? y : x 3826 a->fcmgt4s(dst(), r(y), r(x)); 3827 a->bsl16b (dst(), r(y), r(x)); 3828 break; 3829 3830 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break; 3831 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break; 3832 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break; 3833 3834 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break; 3835 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break; 3836 3837 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break; 3838 case Op::trunc: a->fcvtzs4s(dst(x), r(x)); break; 3839 case Op::round: a->fcvtns4s(dst(x), r(x)); break; 3840 case Op::ceil: a->frintp4s(dst(x), r(x)); break; 3841 case Op::floor: a->frintm4s(dst(x), r(x)); break; 3842 3843 case Op::to_fp16: 3844 a->fcvtn (dst(x), r(x)); // 4x f32 -> 4x f16 in bottom four lanes 3845 a->uxtlh2s(dst(), dst()); // expand to 4x f16 in even 16-bit lanes 3846 break; 3847 3848 case Op::from_fp16: 3849 a->xtns2h(dst(x), r(x)); // pack even 16-bit lanes into bottom four lanes 3850 a->fcvtl (dst(), dst()); // 4x f16 -> 4x f32 3851 break; 3852 3853 case Op::duplicate: break; 3854 #endif 3855 } 3856 3857 // Proactively free the registers holding any value that dies here. 3858 if (rd != NA && dies_here(regs[rd])) { regs[rd] = NA; } 3859 if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; } 3860 if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; } 3861 if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; } 3862 if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; } 3863 return true; 3864 }; 3865 3866 #if defined(__x86_64__) || defined(_M_X64) 3867 auto jump_if_less = [&](A::Label* l) { a->jl (l); }; 3868 auto jump = [&](A::Label* l) { a->jmp(l); }; 3869 3870 auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; 3871 auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; 3872 #elif defined(__aarch64__) 3873 auto jump_if_less = [&](A::Label* l) { a->blt(l); }; 3874 auto jump = [&](A::Label* l) { a->b (l); }; 3875 3876 auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; 3877 auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; 3878 #endif 3879 3880 A::Label body, 3881 tail, 3882 done; 3883 3884 enter(); 3885 for (Val id = 0; id < (Val)instructions.size(); id++) { 3886 if (fImpl->visualizer && is_trace(instructions[id].op)) { 3887 // Make sure trace commands stay on JIT for visualizer 3888 continue; 3889 } 3890 if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 3891 return false; 3892 } 3893 } 3894 3895 // This point marks a kind of canonical fixed point for register contents: if loop 3896 // code is generated as if these registers are holding these values, the next time 3897 // the loop comes around we'd better find those same registers holding those same values. 3898 auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot, 3899 saved_next_stack_slot=next_stack_slot]{ 3900 for (int r = 0; r < (int)regs.size(); r++) { 3901 if (regs[r] != incoming[r]) { 3902 regs[r] = incoming[r]; 3903 if (regs[r] >= 0) { 3904 load_from_memory((Reg)r, regs[r]); 3905 } 3906 } 3907 } 3908 *stack_hint = std::max(*stack_hint, next_stack_slot); 3909 stack_slot = saved_stack_slot; 3910 next_stack_slot = saved_next_stack_slot; 3911 }; 3912 3913 a->label(&body); 3914 { 3915 a->cmp(N, K); 3916 jump_if_less(&tail); 3917 for (Val id = 0; id < (Val)instructions.size(); id++) { 3918 if (fImpl->visualizer != nullptr && is_trace(instructions[id].op)) { 3919 // Make sure trace commands stay on JIT for visualizer 3920 continue; 3921 } 3922 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { 3923 return false; 3924 } 3925 } 3926 restore_incoming_regs(); 3927 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 3928 if (fImpl->strides[i]) { 3929 add(arg[i], K*fImpl->strides[i]); 3930 } 3931 } 3932 sub(N, K); 3933 jump(&body); 3934 } 3935 3936 a->label(&tail); 3937 { 3938 a->cmp(N, 1); 3939 jump_if_less(&done); 3940 for (Val id = 0; id < (Val)instructions.size(); id++) { 3941 if (fImpl->visualizer && is_trace(instructions[id].op)) { 3942 // Make sure trace commands stay on JIT for visualizer 3943 continue; 3944 } 3945 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) { 3946 return false; 3947 } 3948 } 3949 restore_incoming_regs(); 3950 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 3951 if (fImpl->strides[i]) { 3952 add(arg[i], 1*fImpl->strides[i]); 3953 } 3954 } 3955 sub(N, 1); 3956 jump(&tail); 3957 } 3958 3959 a->label(&done); 3960 { 3961 exit(); 3962 } 3963 3964 // On ARM64, we use immediate offsets to adjust the stack pointer, and those are limited to 3965 // 12 bits. If our function is going to require more than 4k of stack, just fail. We could 3966 // tweak the code that adjusts `sp`, but then we risk exceeding the (larger) immediate limit 3967 // on our sp-relative load and store opcodes. 3968 #if defined(__aarch64__) 3969 const int stack_bytes = (*stack_hint) * K * 4; 3970 if (stack_bytes > mask(12)) { 3971 return false; 3972 } 3973 #endif 3974 3975 // Except for explicit aligned load and store instructions, AVX allows 3976 // memory operands to be unaligned. So even though we're creating 16 3977 // byte patterns on ARM or 32-byte patterns on x86, we only need to 3978 // align to 4 bytes, the element size and alignment requirement. 3979 3980 constants.foreach([&](int imm, A::Label* label) { 3981 a->align(4); 3982 a->label(label); 3983 for (int i = 0; i < K; i++) { 3984 a->word(imm); 3985 } 3986 }); 3987 3988 if (!iota.references.empty()) { 3989 a->align(4); 3990 a->label(&iota); // 0,1,2,3,4,... 3991 for (int i = 0; i < K; i++) { 3992 a->word(i); 3993 } 3994 } 3995 3996 if (!load64_index.references.empty()) { 3997 a->align(4); 3998 a->label(&load64_index); // {0,2,4,6|1,3,5,7} 3999 a->word(0); a->word(2); a->word(4); a->word(6); 4000 a->word(1); a->word(3); a->word(5); a->word(7); 4001 } 4002 4003 return true; 4004 } 4005 setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4006 void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions, 4007 const char* debug_name) { 4008 // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble) 4009 // and stack_hint/registers_used to feed forward into the next jit() call. 4010 Assembler a{nullptr}; 4011 int stack_hint = -1; 4012 uint32_t registers_used = 0xffff'ffff; // Start conservatively with all. 4013 if (!this->jit(instructions, &stack_hint, ®isters_used, &a)) { 4014 return; 4015 } 4016 4017 fImpl->jit_size = a.size(); 4018 void* jit_entry = alloc_jit_buffer(&fImpl->jit_size); 4019 fImpl->jit_entry.store(jit_entry); 4020 4021 // Assemble the program for real with stack_hint/registers_used as feedback from first call. 4022 a = Assembler{jit_entry}; 4023 SkAssertResult(this->jit(instructions, &stack_hint, ®isters_used, &a)); 4024 SkASSERT(a.size() <= fImpl->jit_size); 4025 4026 // Remap as executable, and flush caches on platforms that need that. 4027 remap_as_executable(jit_entry, fImpl->jit_size); 4028 4029 #if !defined(SK_BUILD_FOR_WIN) 4030 // For profiling and debugging, it's helpful to have this code loaded 4031 // dynamically rather than just jumping info fImpl->jit_entry. 4032 if (gSkVMJITViaDylib) { 4033 // Dump the raw program binary. 4034 SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name); 4035 int fd = mkstemp(path.data()); 4036 ::write(fd, jit_entry, a.size()); 4037 close(fd); 4038 4039 this->dropJIT(); // (unmap and null out fImpl->jit_entry.) 4040 4041 // Convert it in-place to a dynamic library with a single symbol "skvm_jit": 4042 SkString cmd = SkStringPrintf( 4043 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" 4044 " | clang -x assembler -shared - -o %s", 4045 path.c_str(), path.c_str()); 4046 #if defined(__aarch64__) 4047 cmd.append(" -arch arm64"); 4048 #endif 4049 system(cmd.c_str()); 4050 4051 // Load that dynamic library and look up skvm_jit(). 4052 fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL); 4053 void* sym = nullptr; 4054 for (const char* name : {"skvm_jit", "_skvm_jit"} ) { 4055 if (!sym) { sym = dlsym(fImpl->dylib, name); } 4056 } 4057 fImpl->jit_entry.store(sym); 4058 } 4059 #endif 4060 } 4061 disassemble(SkWStream * o) const4062 void Program::disassemble(SkWStream* o) const { 4063 #if !defined(SK_BUILD_FOR_WIN) 4064 SkDebugfStream debug; 4065 if (!o) { o = &debug; } 4066 4067 const void* jit_entry = fImpl->jit_entry.load(); 4068 size_t jit_size = fImpl->jit_size; 4069 4070 if (!jit_entry) { 4071 o->writeText("Program not JIT'd. Did you pass --jit?\n"); 4072 return; 4073 } 4074 4075 char path[] = "/tmp/skvm-jit.XXXXXX"; 4076 int fd = mkstemp(path); 4077 ::write(fd, jit_entry, jit_size); 4078 close(fd); 4079 4080 // Convert it in-place to a dynamic library with a single symbol "skvm_jit": 4081 SkString cmd = SkStringPrintf( 4082 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" 4083 " | clang -x assembler -shared - -o %s", 4084 path, path); 4085 #if defined(__aarch64__) 4086 cmd.append(" -arch arm64"); 4087 #endif 4088 system(cmd.c_str()); 4089 4090 // Now objdump to disassemble our function: 4091 // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`, 4092 // but the symbol name varies with OS, and that option may be missing from objdump on some 4093 // machines? There also apears to be quite a bit of junk after the end of the JIT'd code. 4094 // Trimming that would let us pass '--visualize-jumps' and get the loop annotated. 4095 // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art. 4096 cmd = SkStringPrintf("objdump -D %s", path); 4097 #if defined(SK_BUILD_FOR_UNIX) 4098 cmd.append(" --section=.text"); 4099 #endif 4100 FILE* fp = popen(cmd.c_str(), "r"); 4101 if (!fp) { 4102 o->writeText("objdump failed\n"); 4103 return; 4104 } 4105 4106 char line[1024]; 4107 while (fgets(line, sizeof(line), fp)) { 4108 o->writeText(line); 4109 } 4110 4111 pclose(fp); 4112 #endif 4113 } 4114 4115 #endif 4116 4117 } // namespace skvm 4118