// Copyright 2020 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

#ifndef SkVM_opts_DEFINED
#define SkVM_opts_DEFINED

#include "src/base/SkVx.h"
#include "src/core/SkVM.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #include <immintrin.h>
#endif

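// Gather 32-bit values: lane i of the result is ptr[ix[i]].
// Uses the AVX2 hardware gather when N == 8, splits wider vectors in half,
// and otherwise falls back to a scalar per-lane load.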
template <int N>
static inline skvx::Vec<N,int> gather32(const int* ptr, const skvx::Vec<N,int>& ix) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N == 8) {
        return skvx::bit_pun<skvx::Vec<N,int>>(
                _mm256_i32gather_epi32(ptr, skvx::bit_pun<__m256i>(ix), 4));
    }
#endif
    // Try to recurse on specializations, falling back on standard scalar map()-based impl.
    if constexpr (N > 8) {
        return join(gather32(ptr, ix.lo),
                    gather32(ptr, ix.hi));
    }
    return map([&](int i) { return ptr[i]; }, ix);
}

namespace SK_OPTS_NS {

namespace SkVMInterpreterTypes {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    constexpr inline int K = 32;  // 1024-bit: 4 ymm or 2 zmm at a time
#else
    constexpr inline int K = 8;   // 256-bit: 2 xmm, 2 v-registers, etc.
#endif
    using I32 = skvx::Vec<K, int>;
    using I16 = skvx::Vec<K, int16_t>;
    using F32 = skvx::Vec<K, float>;
    using U64 = skvx::Vec<K, uint64_t>;
    using U32 = skvx::Vec<K, uint32_t>;
    using U16 = skvx::Vec<K, uint16_t>;
    using U8  = skvx::Vec<K, uint8_t>;
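    // A Slot is one interpreter register: K lanes of storage that can be viewed
    // as any of the element types above without copying.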
    union Slot {
        F32 f32;
        I32 i32;
        U32 u32;
        I16 i16;
        U16 u16;
    };
}  // namespace SkVMInterpreterTypes

inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
                           const int nregs, const int loop,
                           const int strides[],
                           skvm::TraceHook* traceHooks[], const int nTraceHooks,
                           const int nargs, int n, void* args[]) {
    using namespace skvm;

    using SkVMInterpreterTypes::K;
    using SkVMInterpreterTypes::I32;
    using SkVMInterpreterTypes::I16;
    using SkVMInterpreterTypes::F32;
    using SkVMInterpreterTypes::U64;
    using SkVMInterpreterTypes::U32;
    using SkVMInterpreterTypes::U16;
    using SkVMInterpreterTypes::U8;
    using SkVMInterpreterTypes::Slot;

    // We'll operate in SIMT style, knocking off K-size chunks from n while possible.

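    // Registers live in a small fixed-size buffer when possible; larger programs
    // spill to a heap allocation below.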
    Slot few_regs[16];
    std::unique_ptr<char[]> many_regs;

    Slot* r = few_regs;

    if (nregs > (int)std::size(few_regs)) {
        // Annoyingly we can't trust that malloc() or new will work with Slot because
        // the skvx::Vec types may have alignment greater than what they provide.
        // We'll overallocate one extra register so we can align manually.
        many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);

        uintptr_t addr = (uintptr_t)many_regs.get();
        addr += alignof(Slot) -
                (addr & (alignof(Slot) - 1));
        SkASSERT((addr & (alignof(Slot) - 1)) == 0);
        r = (Slot*)addr;
    }

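    // A trace op fires only if its hook index is valid and (r[x] & r[y]) is set
    // in at least one lane that's actually in use.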
    const auto should_trace = [&](int stride, int immA, Reg x, Reg y) -> bool {
        if (immA < 0 || immA >= nTraceHooks) {
            return false;
        }
        // When stride == K, all lanes are used.
        if (stride == K) {
            return any(r[x].i32 & r[y].i32);
        }
        // When stride == 1, only the first lane is used; the rest are not meaningful.
        return r[x].i32[0] & r[y].i32[0];
    };

    // Step each argument pointer ahead by its stride a number of times.
    auto step_args = [&](int times) {
        for (int i = 0; i < nargs; i++) {
            args[i] = (void*)( (char*)args[i] + times * strides[i] );
        }
    };

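    // Run K lanes at a time while at least K items remain, then finish one at a time.
    // After the first pass, start jumps to `loop`, so instructions before that index
    // (presumably hoisted, loop-invariant work) execute only once.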
    int start = 0,
        stride;
    for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
        stride = n >= K ? K : 1;

        for (int instIdx = start; instIdx < ninsts; instIdx++) {
            InterpreterInstruction inst = insts[instIdx];

            // d = op(x,y,z,w, immA,immB)
            Reg d = inst.d,
                x = inst.x,
                y = inst.y,
                z = inst.z,
                w = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;

            // Ops that interact with memory need to know whether we're stride=1 or K,
            // but all non-memory ops can run the same code no matter the stride.
            switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
                default: SkUNREACHABLE;

                #define STRIDE_1(op) case 2*(int)op
                #define STRIDE_K(op) case 2*(int)op + 1
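                // e.g. with stride == K, Op::store32 dispatches to case 2*(int)Op::store32 + 1.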
                STRIDE_1(Op::store8 ): memcpy(args[immA], &r[x].i32, 1); break;
                STRIDE_1(Op::store16): memcpy(args[immA], &r[x].i32, 2); break;
                STRIDE_1(Op::store32): memcpy(args[immA], &r[x].i32, 4); break;
                STRIDE_1(Op::store64): memcpy((char*)args[immA]+0, &r[x].i32, 4);
                                       memcpy((char*)args[immA]+4, &r[y].i32, 4); break;

                STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r[x].i32).store(args[immA]); break;
                STRIDE_K(Op::store16): skvx::cast<uint16_t>(r[x].i32).store(args[immA]); break;
                STRIDE_K(Op::store32):                     (r[x].i32).store(args[immA]); break;
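                // x supplies the low 32 bits and y the high 32 bits of each 64-bit lane.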
                STRIDE_K(Op::store64): (skvx::cast<uint64_t>(r[x].u32) <<  0 |
                                        skvx::cast<uint64_t>(r[y].u32) << 32).store(args[immA]);
                                       break;

                STRIDE_1(Op::load8 ): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 1); break;
                STRIDE_1(Op::load16): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 2); break;
                STRIDE_1(Op::load32): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 4); break;
                STRIDE_1(Op::load64):
                    r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immA] + 4*immB, 4); break;

                STRIDE_K(Op::load8 ): r[d].i32 = skvx::cast<int>(U8 ::Load(args[immA])); break;
                STRIDE_K(Op::load16): r[d].i32 = skvx::cast<int>(U16::Load(args[immA])); break;
                STRIDE_K(Op::load32): r[d].i32 =                 I32::Load(args[immA]);  break;
                STRIDE_K(Op::load64):
                    // Low 32 bits if immB=0, or high 32 bits if immB=1.
                    r[d].i32 = skvx::cast<int>(U64::Load(args[immA]) >> (32*immB)); break;

                // The pointer we base our gather on is loaded indirectly from a uniform:
                //     - args[immA] is the uniform holding our gather base pointer somewhere;
                //     - (const uint8_t*)args[immA] + immB points to the gather base pointer;
                //     - memcpy() loads the gather base pointer into a pointer of the right type.
                // After all that we have an ordinary (uniform) pointer `ptr` to load from,
                // and we then gather from it using the varying indices in r[x].
                STRIDE_1(Op::gather8): {
                    const uint8_t* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = ptr[ r[x].i32[0] ];
                } break;
                STRIDE_1(Op::gather16): {
                    const uint16_t* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = ptr[ r[x].i32[0] ];
                } break;
                STRIDE_1(Op::gather32): {
                    const int* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = ptr[ r[x].i32[0] ];
                } break;

                STRIDE_K(Op::gather8): {
                    const uint8_t* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = map([&](int ix) { return (int)ptr[ix]; }, r[x].i32);
                } break;
                STRIDE_K(Op::gather16): {
                    const uint16_t* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = map([&](int ix) { return (int)ptr[ix]; }, r[x].i32);
                } break;
                STRIDE_K(Op::gather32): {
                    const int* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = gather32(ptr, r[x].i32);
                } break;

                #undef STRIDE_1
                #undef STRIDE_K

                // Ops that don't interact with memory should never care about the stride.
                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1

                // These 128-bit ops are implemented serially for simplicity.
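                // store128 writes 16 bytes per lane: x/y form the low 8 bytes
                // (low/high 32-bit halves), and z/w form the high 8 bytes.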
                CASE(Op::store128): {
                    U64 lo = (skvx::cast<uint64_t>(r[x].u32) <<  0 |
                              skvx::cast<uint64_t>(r[y].u32) << 32),
                        hi = (skvx::cast<uint64_t>(r[z].u32) <<  0 |
                              skvx::cast<uint64_t>(r[w].u32) << 32);
                    for (int i = 0; i < stride; i++) {
                        memcpy((char*)args[immA] + 16*i + 0, &lo[i], 8);
                        memcpy((char*)args[immA] + 16*i + 8, &hi[i], 8);
                    }
                } break;

                CASE(Op::load128):
                    r[d].i32 = 0;
                    for (int i = 0; i < stride; i++) {
                        memcpy(&r[d].i32[i], (const char*)args[immA] + 16*i + 4*immB, 4);
                    } break;

                CASE(Op::assert_true):
                #ifdef SK_DEBUG
                    if (!all(r[x].i32)) {
                        SkDebugf("inst %d, register %d\n", instIdx, y);
                        for (int i = 0; i < K; i++) {
                            SkDebugf("\t%2d: %08x (%g)\n",
                                     i, r[y].i32[i], r[y].f32[i]);
                        }
                        SkASSERT(false);
                    }
                #endif
                break;

                CASE(Op::trace_line):
                    if (should_trace(stride, immA, x, y)) {
                        traceHooks[immA]->line(immB);
                    }
                    break;

                CASE(Op::trace_var):
                    if (should_trace(stride, immA, x, y)) {
                        for (int i = 0; i < K; ++i) {
                            if (r[x].i32[i] & r[y].i32[i]) {
                                traceHooks[immA]->var(immB, r[z].i32[i]);
                                break;
                            }
                        }
                    }
                    break;

                CASE(Op::trace_enter):
                    if (should_trace(stride, immA, x, y)) {
                        traceHooks[immA]->enter(immB);
                    }
                    break;

                CASE(Op::trace_exit):
                    if (should_trace(stride, immA, x, y)) {
                        traceHooks[immA]->exit(immB);
                    }
                    break;

                CASE(Op::trace_scope):
                    if (should_trace(stride, immA, x, y)) {
                        traceHooks[immA]->scope(immB);
                    }
                    break;

                CASE(Op::index): {
                    const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
                                        16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
                                        32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
                                        48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 };
                    static_assert(K <= std::size(iota), "");

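                    // Lane i gets n - i, the number of items still remaining as seen from that lane.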
                    r[d].i32 = n - I32::Load(iota);
                } break;

                CASE(Op::uniform32):
                    r[d].i32 = *(const int*)( (const char*)args[immA] + immB );
                    break;

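                // array32: args[immA] + immB holds a pointer to an int array;
                // immC is a byte offset into that array.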
                CASE(Op::array32): {
                    const int* ptr;
                    memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
                    r[d].i32 = ptr[immC/sizeof(int)];
                } break;

                CASE(Op::splat): r[d].i32 = immA; break;

                CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
                CASE(Op::sub_f32): r[d].f32 = r[x].f32 - r[y].f32; break;
                CASE(Op::mul_f32): r[d].f32 = r[x].f32 * r[y].f32; break;
                CASE(Op::div_f32): r[d].f32 = r[x].f32 / r[y].f32; break;
                CASE(Op::min_f32): r[d].f32 = min(r[x].f32, r[y].f32); break;
                CASE(Op::max_f32): r[d].f32 = max(r[x].f32, r[y].f32); break;

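                // Fused multiply ops: fma = x*y + z, fms = x*y - z, fnma = z - x*y.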
                CASE(Op::fma_f32):  r[d].f32 = fma( r[x].f32, r[y].f32,  r[z].f32); break;
                CASE(Op::fms_f32):  r[d].f32 = fma( r[x].f32, r[y].f32, -r[z].f32); break;
                CASE(Op::fnma_f32): r[d].f32 = fma(-r[x].f32, r[y].f32,  r[z].f32); break;

                CASE(Op::sqrt_f32): r[d].f32 = sqrt(r[x].f32); break;

                CASE(Op::add_i32): r[d].i32 = r[x].i32 + r[y].i32; break;
                CASE(Op::sub_i32): r[d].i32 = r[x].i32 - r[y].i32; break;
                CASE(Op::mul_i32): r[d].i32 = r[x].i32 * r[y].i32; break;

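                // sra is an arithmetic (sign-extending) shift; shr is a logical shift.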
                CASE(Op::shl_i32): r[d].i32 = r[x].i32 << immA; break;
                CASE(Op::sra_i32): r[d].i32 = r[x].i32 >> immA; break;
                CASE(Op::shr_i32): r[d].u32 = r[x].u32 >> immA; break;

                CASE(Op:: eq_f32): r[d].i32 = r[x].f32 == r[y].f32; break;
                CASE(Op::neq_f32): r[d].i32 = r[x].f32 != r[y].f32; break;
                CASE(Op:: gt_f32): r[d].i32 = r[x].f32 >  r[y].f32; break;
                CASE(Op::gte_f32): r[d].i32 = r[x].f32 >= r[y].f32; break;

                CASE(Op:: eq_i32): r[d].i32 = r[x].i32 == r[y].i32; break;
                CASE(Op:: gt_i32): r[d].i32 = r[x].i32 >  r[y].i32; break;

                CASE(Op::bit_and  ): r[d].i32 = r[x].i32 &  r[y].i32; break;
                CASE(Op::bit_or   ): r[d].i32 = r[x].i32 |  r[y].i32; break;
                CASE(Op::bit_xor  ): r[d].i32 = r[x].i32 ^  r[y].i32; break;
                CASE(Op::bit_clear): r[d].i32 = r[x].i32 & ~r[y].i32; break;

                CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
                                  break;

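                // Float <-> int conversions: trunc truncates toward zero; round uses lrint()
                // to round to the nearest integer.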
                CASE(Op::ceil):   r[d].f32 = skvx::ceil(r[x].f32);  break;
                CASE(Op::floor):  r[d].f32 = skvx::floor(r[x].f32); break;
                CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break;
                CASE(Op::trunc):  r[d].i32 = skvx::cast<int>  ( r[x].f32 ); break;
                CASE(Op::round):  r[d].i32 = skvx::cast<int>  (skvx::lrint(r[x].f32)); break;

                CASE(Op::to_fp16):
                    r[d].i32 = skvx::cast<int>(skvx::to_half(r[x].f32));
                    break;
                CASE(Op::from_fp16):
                    r[d].f32 = skvx::from_half(skvx::cast<uint16_t>(r[x].i32));
                    break;

                #undef CASE
            }
        }
    }
}

}  // namespace SK_OPTS_NS

#endif  // SkVM_opts_DEFINED