1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkRasterPipeline_opts_DEFINED
9 #define SkRasterPipeline_opts_DEFINED
10
11 #include "include/core/SkTypes.h"
12 #include "src/core/SkUtils.h" // unaligned_{load,store}
13 #include "src/sksl/SkSLByteCode.h"
14
15 // Every function in this file should be marked static and inline using SI.
16 #if defined(__clang__)
17 #define SI __attribute__((always_inline)) static inline
18 #else
19 #define SI static inline
20 #endif
21
22 template <typename Dst, typename Src>
23 SI Dst bit_cast(const Src& src) {
24 static_assert(sizeof(Dst) == sizeof(Src), "");
25 return sk_unaligned_load<Dst>(&src);
26 }
27
28 template <typename Dst, typename Src>
29 SI Dst widen_cast(const Src& src) {
30 static_assert(sizeof(Dst) > sizeof(Src), "");
31 Dst dst;
32 memcpy(&dst, &src, sizeof(Src));
33 return dst;
34 }
35
36 // Our program is an array of void*, either
37 // - 1 void* per stage with no context pointer, the next stage;
38 // - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
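// For example, a hypothetical two-stage program (the variable name colorCtx is illustrative,
// not something built in this file) might be laid out as:
//     { (void*)uniform_color, (void*)&colorCtx,   // stage with a context pointer
//       (void*)swap_rb,                           // stage with no context pointer
//       (void*)just_return };                     // sentinel stage that ends the chain
// load_and_inc() below walks this array one void* at a time.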
39
40 // load_and_inc() steps the program forward by 1 void*, returning that pointer.
41 SI void* load_and_inc(void**& program) {
42 #if defined(__GNUC__) && defined(__x86_64__)
43 // If program is in %rsi (we try to make this likely) then this is a single instruction.
44 void* rax;
45 asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
46 return rax;
47 #else
48 // On ARM *program++ compiles into pretty ideal code without any handholding.
49 return *program++;
50 #endif
51 }
52
53 // Lazily resolved on first cast. Does nothing if cast to Ctx::None.
54 struct Ctx {
55 struct None {};
56
57 void* ptr;
58 void**& program;
59
60 explicit Ctx(void**& p) : ptr(nullptr), program(p) {}
61
62 template <typename T>
63 operator T*() {
64 if (!ptr) { ptr = load_and_inc(program); }
65 return (T*)ptr;
66 }
67 operator None() { return None{}; }
68 };
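// A sketch of how Ctx is consumed (parameter names here are illustrative): inside the STAGE
// machinery below, a stage taking a context effectively does
//     const float* rate = Ctx{program};   // the first cast pulls the next void* off the program
// while a stage declared with Ctx::None converts via operator None() and consumes nothing,
// which is why it occupies only one void* in the program array.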
69
70
71 #if !defined(__clang__)
72 #define JUMPER_IS_SCALAR
73 #elif defined(SK_ARM_HAS_NEON)
74 #define JUMPER_IS_NEON
75 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX512
76 #define JUMPER_IS_AVX512
77 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
78 #define JUMPER_IS_HSW
79 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
80 #define JUMPER_IS_AVX
81 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
82 #define JUMPER_IS_SSE41
83 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
84 #define JUMPER_IS_SSE2
85 #else
86 #define JUMPER_IS_SCALAR
87 #endif
88
89 // Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
90 #if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
91 // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
92 #if defined(__apple_build_version__) && __clang_major__ < 9
93 #define JUMPER_IS_SCALAR
94 #elif __clang_major__ < 5
95 #define JUMPER_IS_SCALAR
96 #endif
97
98 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
99 #undef JUMPER_IS_NEON
100 #endif
101 #endif
102
103 #if defined(JUMPER_IS_SCALAR)
104 #include <math.h>
105 #elif defined(JUMPER_IS_NEON)
106 #include <arm_neon.h>
107 #else
108 #include <immintrin.h>
109 #endif
110
111 namespace SK_OPTS_NS {
112
113 #if defined(JUMPER_IS_SCALAR)
114 // This path should lead to portable scalar code.
115 using F = float ;
116 using I32 = int32_t;
117 using U64 = uint64_t;
118 using U32 = uint32_t;
119 using U16 = uint16_t;
120 using U8 = uint8_t ;
121
122 SI F mad(F f, F m, F a) { return f*m+a; }
123 SI F min(F a, F b) { return fminf(a,b); }
124 SI F max(F a, F b) { return fmaxf(a,b); }
125 SI F abs_ (F v) { return fabsf(v); }
126 SI F floor_(F v) { return floorf(v); }
127 SI F rcp (F v) { return 1.0f / v; }
128 SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
129 SI F sqrt_(F v) { return sqrtf(v); }
130 SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
131 SI U16 pack(U32 v) { return (U16)v; }
132 SI U8 pack(U16 v) { return (U8)v; }
133
134 SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
135
136 template <typename T>
137 SI T gather(const T* p, U32 ix) { return p[ix]; }
138
139 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
140 *r = ptr[0];
141 *g = ptr[1];
142 }
143 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
144 ptr[0] = r;
145 ptr[1] = g;
146 }
147 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
148 *r = ptr[0];
149 *g = ptr[1];
150 *b = ptr[2];
151 }
152 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
153 *r = ptr[0];
154 *g = ptr[1];
155 *b = ptr[2];
156 *a = ptr[3];
157 }
158 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
159 ptr[0] = r;
160 ptr[1] = g;
161 ptr[2] = b;
162 ptr[3] = a;
163 }
164
165 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
166 *r = ptr[0];
167 *g = ptr[1];
168 }
169 SI void store2(float* ptr, size_t tail, F r, F g) {
170 ptr[0] = r;
171 ptr[1] = g;
172 }
173 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
174 *r = ptr[0];
175 *g = ptr[1];
176 *b = ptr[2];
177 *a = ptr[3];
178 }
179 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
180 ptr[0] = r;
181 ptr[1] = g;
182 ptr[2] = b;
183 ptr[3] = a;
184 }
185
186 #elif defined(JUMPER_IS_NEON)
187 // Since we know we're using Clang, we can use its vector extensions.
188 template <typename T> using V = T __attribute__((ext_vector_type(4)));
189 using F = V<float >;
190 using I32 = V< int32_t>;
191 using U64 = V<uint64_t>;
192 using U32 = V<uint32_t>;
193 using U16 = V<uint16_t>;
194 using U8 = V<uint8_t >;
195
196 // We polyfill a few routines that Clang doesn't build into ext_vector_types.
197 SI F min(F a, F b) { return vminq_f32(a,b); }
198 SI F max(F a, F b) { return vmaxq_f32(a,b); }
199 SI F abs_ (F v) { return vabsq_f32(v); }
200 SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
201 SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
202 SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
203 SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
204
205 SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
206
207 #if defined(SK_CPU_ARM64)
208 SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
209 SI F floor_(F v) { return vrndmq_f32(v); }
210 SI F sqrt_(F v) { return vsqrtq_f32(v); }
211 SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
212 #else
213 SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
214 SI F floor_(F v) {
215 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
216 return roundtrip - if_then_else(roundtrip > v, 1, 0);
217 }
218
219 SI F sqrt_(F v) {
220 auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
221 e *= vrsqrtsq_f32(v,e*e);
222 e *= vrsqrtsq_f32(v,e*e);
223 return v*e; // sqrt(v) == v*rsqrt(v).
224 }
225
226 SI U32 round(F v, F scale) {
227 return vcvtq_u32_f32(mad(v,scale,0.5f));
228 }
229 #endif
230
231
232 template <typename T>
233 SI V<T> gather(const T* p, U32 ix) {
234 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
235 }
236 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
237 uint16x4x2_t rg;
238 if (__builtin_expect(tail,0)) {
239 if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
240 if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
241 if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
242 } else {
243 rg = vld2_u16(ptr);
244 }
245 *r = rg.val[0];
246 *g = rg.val[1];
247 }
248 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
249 if (__builtin_expect(tail,0)) {
250 if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
251 if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
252 if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
253 } else {
254 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
255 }
256 }
257 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
258 uint16x4x3_t rgb;
259 if (__builtin_expect(tail,0)) {
260 if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
261 if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
262 if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
263 } else {
264 rgb = vld3_u16(ptr);
265 }
266 *r = rgb.val[0];
267 *g = rgb.val[1];
268 *b = rgb.val[2];
269 }
270 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
271 uint16x4x4_t rgba;
272 if (__builtin_expect(tail,0)) {
273 if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
274 if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
275 if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
276 } else {
277 rgba = vld4_u16(ptr);
278 }
279 *r = rgba.val[0];
280 *g = rgba.val[1];
281 *b = rgba.val[2];
282 *a = rgba.val[3];
283 }
284
285 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
286 if (__builtin_expect(tail,0)) {
287 if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
288 if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
289 if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
290 } else {
291 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
292 }
293 }
294 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
295 float32x4x2_t rg;
296 if (__builtin_expect(tail,0)) {
297 if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
298 if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
299 if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
300 } else {
301 rg = vld2q_f32(ptr);
302 }
303 *r = rg.val[0];
304 *g = rg.val[1];
305 }
306 SI void store2(float* ptr, size_t tail, F r, F g) {
307 if (__builtin_expect(tail,0)) {
308 if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
309 if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
310 if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
311 } else {
312 vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
313 }
314 }
315 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
316 float32x4x4_t rgba;
317 if (__builtin_expect(tail,0)) {
318 if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
319 if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
320 if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
321 } else {
322 rgba = vld4q_f32(ptr);
323 }
324 *r = rgba.val[0];
325 *g = rgba.val[1];
326 *b = rgba.val[2];
327 *a = rgba.val[3];
328 }
329 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
330 if (__builtin_expect(tail,0)) {
331 if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
332 if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
333 if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
334 } else {
335 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
336 }
337 }
338
339 #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
340 // These are __m256 and __m256i, but friendlier and strongly-typed.
341 template <typename T> using V = T __attribute__((ext_vector_type(8)));
342 using F = V<float >;
343 using I32 = V< int32_t>;
344 using U64 = V<uint64_t>;
345 using U32 = V<uint32_t>;
346 using U16 = V<uint16_t>;
347 using U8 = V<uint8_t >;
348
349 SI F mad(F f, F m, F a) {
350 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
351 return _mm256_fmadd_ps(f,m,a);
352 #else
353 return f*m+a;
354 #endif
355 }
356
357 SI F min(F a, F b) { return _mm256_min_ps(a,b); }
358 SI F max(F a, F b) { return _mm256_max_ps(a,b); }
359 SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
360 SI F floor_(F v) { return _mm256_floor_ps(v); }
361 SI F rcp (F v) { return _mm256_rcp_ps (v); }
362 SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
363 SI F sqrt_(F v) { return _mm256_sqrt_ps (v); }
364 SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
365
366 SI U16 pack(U32 v) {
367 return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
368 _mm256_extractf128_si256(v, 1));
369 }
370 SI U8 pack(U16 v) {
371 auto r = _mm_packus_epi16(v,v);
372 return sk_unaligned_load<U8>(&r);
373 }
374
375 SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
376
377 template <typename T>
378 SI V<T> gather(const T* p, U32 ix) {
379 return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
380 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
381 }
382 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
383 SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); }
384 SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
385 SI U64 gather(const uint64_t* p, U32 ix) {
386 __m256i parts[] = {
387 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
388 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
389 };
390 return bit_cast<U64>(parts);
391 }
392 #endif
393
394 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
395 U16 _0123, _4567;
396 if (__builtin_expect(tail,0)) {
397 _0123 = _4567 = _mm_setzero_si128();
398 auto* d = &_0123;
399 if (tail > 3) {
400 *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
401 tail -= 4;
402 ptr += 8;
403 d = &_4567;
404 }
405 bool high = false;
406 if (tail > 1) {
407 *d = _mm_loadu_si64(ptr);
408 tail -= 2;
409 ptr += 4;
410 high = true;
411 }
412 if (tail > 0) {
413 (*d)[high ? 4 : 0] = *(ptr + 0);
414 (*d)[high ? 5 : 1] = *(ptr + 1);
415 }
416 } else {
417 _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
418 _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
419 }
420 *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
421 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
422 *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16),
423 _mm_srai_epi32(_4567, 16));
424 }
425 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
426 auto _0123 = _mm_unpacklo_epi16(r, g),
427 _4567 = _mm_unpackhi_epi16(r, g);
428 if (__builtin_expect(tail,0)) {
429 const auto* s = &_0123;
430 if (tail > 3) {
431 _mm_storeu_si128((__m128i*)ptr, *s);
432 s = &_4567;
433 tail -= 4;
434 ptr += 8;
435 }
436 bool high = false;
437 if (tail > 1) {
438 _mm_storel_epi64((__m128i*)ptr, *s);
439 ptr += 4;
440 tail -= 2;
441 high = true;
442 }
443 if (tail > 0) {
444 if (high) {
445 *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
446 } else {
447 *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
448 }
449 }
450 } else {
451 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
452 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
453 }
454 }
455
456 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
457 __m128i _0,_1,_2,_3,_4,_5,_6,_7;
458 if (__builtin_expect(tail,0)) {
459 auto load_rgb = [](const uint16_t* src) {
460 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
461 return _mm_insert_epi16(v, src[2], 2);
462 };
463 _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
464 if ( true ) { _0 = load_rgb(ptr + 0); }
465 if (tail > 1) { _1 = load_rgb(ptr + 3); }
466 if (tail > 2) { _2 = load_rgb(ptr + 6); }
467 if (tail > 3) { _3 = load_rgb(ptr + 9); }
468 if (tail > 4) { _4 = load_rgb(ptr + 12); }
469 if (tail > 5) { _5 = load_rgb(ptr + 15); }
470 if (tail > 6) { _6 = load_rgb(ptr + 18); }
471 } else {
472 // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
473 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ;
474 auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ;
475 auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ;
476 auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
477 _0 = _01; _1 = _mm_srli_si128(_01, 6);
478 _2 = _23; _3 = _mm_srli_si128(_23, 6);
479 _4 = _45; _5 = _mm_srli_si128(_45, 6);
480 _6 = _67; _7 = _mm_srli_si128(_67, 6);
481 }
482
483 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
484 _13 = _mm_unpacklo_epi16(_1, _3),
485 _46 = _mm_unpacklo_epi16(_4, _6),
486 _57 = _mm_unpacklo_epi16(_5, _7);
487
488 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
489 bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx
490 rg4567 = _mm_unpacklo_epi16(_46, _57),
491 bx4567 = _mm_unpackhi_epi16(_46, _57);
492
493 *r = _mm_unpacklo_epi64(rg0123, rg4567);
494 *g = _mm_unpackhi_epi64(rg0123, rg4567);
495 *b = _mm_unpacklo_epi64(bx0123, bx4567);
496 }
497 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
498 __m128i _01, _23, _45, _67;
499 if (__builtin_expect(tail,0)) {
500 auto src = (const double*)ptr;
501 _01 = _23 = _45 = _67 = _mm_setzero_si128();
502 if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
503 if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
504 if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
505 if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
506 if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
507 if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
508 if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
509 } else {
510 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
511 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
512 _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
513 _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
514 }
515
516 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
517 _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
518 _46 = _mm_unpacklo_epi16(_45, _67),
519 _57 = _mm_unpackhi_epi16(_45, _67);
520
521 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
522 ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
523 rg4567 = _mm_unpacklo_epi16(_46, _57),
524 ba4567 = _mm_unpackhi_epi16(_46, _57);
525
526 *r = _mm_unpacklo_epi64(rg0123, rg4567);
527 *g = _mm_unpackhi_epi64(rg0123, rg4567);
528 *b = _mm_unpacklo_epi64(ba0123, ba4567);
529 *a = _mm_unpackhi_epi64(ba0123, ba4567);
530 }
531 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
532 auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
533 rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
534 ba0123 = _mm_unpacklo_epi16(b, a),
535 ba4567 = _mm_unpackhi_epi16(b, a);
536
537 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
538 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
539 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
540 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
541
542 if (__builtin_expect(tail,0)) {
543 auto dst = (double*)ptr;
544 if (tail > 0) { _mm_storel_pd(dst+0, _01); }
545 if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
546 if (tail > 2) { _mm_storel_pd(dst+2, _23); }
547 if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
548 if (tail > 4) { _mm_storel_pd(dst+4, _45); }
549 if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
550 if (tail > 6) { _mm_storel_pd(dst+6, _67); }
551 } else {
552 _mm_storeu_si128((__m128i*)ptr + 0, _01);
553 _mm_storeu_si128((__m128i*)ptr + 1, _23);
554 _mm_storeu_si128((__m128i*)ptr + 2, _45);
555 _mm_storeu_si128((__m128i*)ptr + 3, _67);
556 }
557 }
558
559 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
560 F _0123, _4567;
561 if (__builtin_expect(tail, 0)) {
562 _0123 = _4567 = _mm256_setzero_ps();
563 F* d = &_0123;
564 if (tail > 3) {
565 *d = _mm256_loadu_ps(ptr);
566 ptr += 8;
567 tail -= 4;
568 d = &_4567;
569 }
570 bool high = false;
571 if (tail > 1) {
572 *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
573 ptr += 4;
574 tail -= 2;
575 high = true;
576 }
577 if (tail > 0) {
578 *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
579 : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
580 }
581 } else {
582 _0123 = _mm256_loadu_ps(ptr + 0);
583 _4567 = _mm256_loadu_ps(ptr + 8);
584 }
585
586 F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20),
587 _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31);
588
589 *r = _mm256_shuffle_ps(_0145, _2367, 0x88);
590 *g = _mm256_shuffle_ps(_0145, _2367, 0xDD);
591 }
592 SI void store2(float* ptr, size_t tail, F r, F g) {
593 F _0145 = _mm256_unpacklo_ps(r, g),
594 _2367 = _mm256_unpackhi_ps(r, g);
595 F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20),
596 _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31);
597
598 if (__builtin_expect(tail, 0)) {
599 const __m256* s = &_0123;
600 if (tail > 3) {
601 _mm256_storeu_ps(ptr, *s);
602 s = &_4567;
603 tail -= 4;
604 ptr += 8;
605 }
606 bool high = false;
607 if (tail > 1) {
608 _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
609 ptr += 4;
610 tail -= 2;
611 high = true;
612 }
613 if (tail > 0) {
614 *(ptr + 0) = (*s)[ high ? 4 : 0];
615 *(ptr + 1) = (*s)[ high ? 5 : 1];
616 }
617 } else {
618 _mm256_storeu_ps(ptr + 0, _0123);
619 _mm256_storeu_ps(ptr + 8, _4567);
620 }
621 }
622
623 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
624 F _04, _15, _26, _37;
625 _04 = _15 = _26 = _37 = 0;
626 switch (tail) {
627 case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
628 case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
629 case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
630 case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
631 case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
632 case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
633 case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
634 case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
635 }
636
637 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
638 ba0145 = _mm256_unpackhi_ps(_04,_15),
639 rg2367 = _mm256_unpacklo_ps(_26,_37),
640 ba2367 = _mm256_unpackhi_ps(_26,_37);
641
642 *r = _mm256_unpacklo_pd(rg0145, rg2367);
643 *g = _mm256_unpackhi_pd(rg0145, rg2367);
644 *b = _mm256_unpacklo_pd(ba0145, ba2367);
645 *a = _mm256_unpackhi_pd(ba0145, ba2367);
646 }
647 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
648 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
649 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
650 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
651 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
652
653 F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
654 _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
655 _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
656 _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
657
658 if (__builtin_expect(tail, 0)) {
659 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
660 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
661 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
662 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
663 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
664 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
665 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
666 } else {
667 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
668 _23 = _mm256_permute2f128_ps(_26, _37, 32),
669 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
670 _67 = _mm256_permute2f128_ps(_26, _37, 49);
671 _mm256_storeu_ps(ptr+ 0, _01);
672 _mm256_storeu_ps(ptr+ 8, _23);
673 _mm256_storeu_ps(ptr+16, _45);
674 _mm256_storeu_ps(ptr+24, _67);
675 }
676 }
677
678 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
679 template <typename T> using V = T __attribute__((ext_vector_type(4)));
680 using F = V<float >;
681 using I32 = V< int32_t>;
682 using U64 = V<uint64_t>;
683 using U32 = V<uint32_t>;
684 using U16 = V<uint16_t>;
685 using U8 = V<uint8_t >;
686
687 SI F mad(F f, F m, F a) { return f*m+a; }
688 SI F min(F a, F b) { return _mm_min_ps(a,b); }
689 SI F max(F a, F b) { return _mm_max_ps(a,b); }
690 SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
691 SI F rcp (F v) { return _mm_rcp_ps (v); }
692 SI F rsqrt (F v) { return _mm_rsqrt_ps(v); }
693 SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
694 SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
695
696 SI U16 pack(U32 v) {
697 #if defined(JUMPER_IS_SSE41)
698 auto p = _mm_packus_epi32(v,v);
699 #else
700 // Sign extend so that _mm_packs_epi32() does the pack we want.
701 auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
702 p = _mm_packs_epi32(p,p);
703 #endif
704 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
705 }
706 SI U8 pack(U16 v) {
707 auto r = widen_cast<__m128i>(v);
708 r = _mm_packus_epi16(r,r);
709 return sk_unaligned_load<U8>(&r);
710 }
711
712 SI F if_then_else(I32 c, F t, F e) {
713 return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
714 }
715
716 SI F floor_(F v) {
717 #if defined(JUMPER_IS_SSE41)
718 return _mm_floor_ps(v);
719 #else
720 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
721 return roundtrip - if_then_else(roundtrip > v, 1, 0);
722 #endif
723 }
724
725 template <typename T>
726 SI V<T> gather(const T* p, U32 ix) {
727 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
728 }
729
730 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
731 __m128i _01;
732 if (__builtin_expect(tail,0)) {
733 _01 = _mm_setzero_si128();
734 if (tail > 1) {
735 _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00
736 if (tail > 2) {
737 _01 = _mm_loadh_pi(_01, (__m64 const* )(ptr + 4)); // r0 g0 r1 g1 r2 g2 00 00
738 }
739 } else {
740 _01 = _mm_loadl_pi(_01, (__m64 const*)ptr + 0); // r0 g0 00 00 00 00 00 00
741 }
742 } else {
743 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
744 }
745 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3
746 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
747
748 auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
749 auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
750 *r = sk_unaligned_load<U16>(&R);
751 *g = sk_unaligned_load<U16>(&G);
752 }
753 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
754 U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
755 if (__builtin_expect(tail, 0)) {
756 if (tail > 1) {
757 _mm_storel_epi64((__m128i*)ptr, rg);
758 if (tail > 2) {
759 int32_t rgpair = rg[2];
760 memcpy(ptr + 4, &rgpair, sizeof(rgpair));
761 }
762 } else {
763 int32_t rgpair = rg[0];
764 memcpy(ptr, &rgpair, sizeof(rgpair));
765 }
766 } else {
767 _mm_storeu_si128((__m128i*)ptr + 0, rg);
768 }
769 }
770
771 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
772 __m128i _0, _1, _2, _3;
773 if (__builtin_expect(tail,0)) {
774 _1 = _2 = _3 = _mm_setzero_si128();
775 auto load_rgb = [](const uint16_t* src) {
776 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
777 return _mm_insert_epi16(v, src[2], 2);
778 };
779 if ( true ) { _0 = load_rgb(ptr + 0); }
780 if (tail > 1) { _1 = load_rgb(ptr + 3); }
781 if (tail > 2) { _2 = load_rgb(ptr + 6); }
782 } else {
783 // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
784 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
785 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
786
787 // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
788 _0 = _01;
789 _1 = _mm_srli_si128(_01, 6);
790 _2 = _23;
791 _3 = _mm_srli_si128(_23, 6);
792 }
793
794 // De-interlace to R,G,B.
795 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
796 _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx
797
798 auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
799 G = _mm_srli_si128(R, 8),
800 B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx
801
802 *r = sk_unaligned_load<U16>(&R);
803 *g = sk_unaligned_load<U16>(&G);
804 *b = sk_unaligned_load<U16>(&B);
805 }
806
807 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
808 __m128i _01, _23;
809 if (__builtin_expect(tail,0)) {
810 _01 = _23 = _mm_setzero_si128();
811 auto src = (const double*)ptr;
812 if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
813 if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
814 if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
815 } else {
816 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
817 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
818 }
819
820 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
821 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
822
823 auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
824 ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
825
826 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
827 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
828 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
829 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
830 }
831
832 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
833 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
834 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
835
836 if (__builtin_expect(tail, 0)) {
837 auto dst = (double*)ptr;
838 if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
839 if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
840 if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
841 } else {
842 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
843 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
844 }
845 }
846
847 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
848 F _01, _23;
849 if (__builtin_expect(tail, 0)) {
850 _01 = _23 = _mm_setzero_si128();
851 if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); }
852 if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
853 if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
854 } else {
855 _01 = _mm_loadu_ps(ptr + 0);
856 _23 = _mm_loadu_ps(ptr + 4);
857 }
858 *r = _mm_shuffle_ps(_01, _23, 0x88);
859 *g = _mm_shuffle_ps(_01, _23, 0xDD);
860 }
861 SI void store2(float* ptr, size_t tail, F r, F g) {
862 F _01 = _mm_unpacklo_ps(r, g),
863 _23 = _mm_unpackhi_ps(r, g);
864 if (__builtin_expect(tail, 0)) {
865 if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); }
866 if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
867 if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
868 } else {
869 _mm_storeu_ps(ptr + 0, _01);
870 _mm_storeu_ps(ptr + 4, _23);
871 }
872 }
873
874 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
875 F _0, _1, _2, _3;
876 if (__builtin_expect(tail, 0)) {
877 _1 = _2 = _3 = _mm_setzero_si128();
878 if ( true ) { _0 = _mm_loadu_ps(ptr + 0); }
879 if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
880 if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
881 } else {
882 _0 = _mm_loadu_ps(ptr + 0);
883 _1 = _mm_loadu_ps(ptr + 4);
884 _2 = _mm_loadu_ps(ptr + 8);
885 _3 = _mm_loadu_ps(ptr +12);
886 }
887 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
888 *r = _0;
889 *g = _1;
890 *b = _2;
891 *a = _3;
892 }
893
894 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
895 _MM_TRANSPOSE4_PS(r,g,b,a);
896 if (__builtin_expect(tail, 0)) {
897 if ( true ) { _mm_storeu_ps(ptr + 0, r); }
898 if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
899 if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
900 } else {
901 _mm_storeu_ps(ptr + 0, r);
902 _mm_storeu_ps(ptr + 4, g);
903 _mm_storeu_ps(ptr + 8, b);
904 _mm_storeu_ps(ptr +12, a);
905 }
906 }
907 #endif
908
909 // We need to be careful with casts.
910 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
911 // These named casts and bit_cast() are always what they seem to be.
912 #if defined(JUMPER_IS_SCALAR)
913 SI F cast (U32 v) { return (F)v; }
914 SI F cast64(U64 v) { return (F)v; }
915 SI U32 trunc_(F v) { return (U32)v; }
916 SI U32 expand(U16 v) { return (U32)v; }
917 SI U32 expand(U8 v) { return (U32)v; }
918 #else
919 SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
920 SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
921 SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
922 SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
923 SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
924 #endif
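// For example, cast() of a U32 lane holding 255 yields 255.0f on every path, whereas writing
// (F)v directly on the vector paths would only reinterpret the lane bits as (denormal-sized)
// floats. That difference is why these named casts exist.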
925
926 template <typename V>
927 SI V if_then_else(I32 c, V t, V e) {
928 return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
929 }
930
931 SI U16 bswap(U16 x) {
932 #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
933 // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
934 // when generating code for SSE2 and SSE4.1. We'll do it manually...
935 auto v = widen_cast<__m128i>(x);
936 v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
937 return sk_unaligned_load<U16>(&v);
938 #else
939 return (x<<8) | (x>>8);
940 #endif
941 }
942
943 SI F fract(F v) { return v - floor_(v); }
944
945 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
946 SI F approx_log2(F x) {
947 // e - 127 is a fair approximation of log2(x) in its own right...
948 F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));
949
950 // ... but using the mantissa to refine its error is _much_ better.
951 F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
952 return e
953 - 124.225514990f
954 - 1.498030302f * m
955 - 1.725879990f / (0.3520887068f + m);
956 }
957 SI F approx_pow2(F x) {
958 F f = fract(x);
959 return bit_cast<F>(round(1.0f * (1<<23),
960 x + 121.274057500f
961 - 1.490129070f * f
962 + 27.728023300f / (4.84252568f - f)));
963 }
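// A rough sanity check on the two approximations above: approx_log2(8.0f) is close to 3.0f
// and approx_pow2(3.0f) is close to 8.0f; approx_powf() below combines them as
// x^y ~= 2^(y * log2(x)).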
964
965 SI F approx_powf(F x, F y) {
966 #if defined(SK_LEGACY_APPROX_POWF_SPECIALCASE)
967 return if_then_else((x == 0) , 0
968 #else
969 return if_then_else((x == 0)|(x == 1), x
970 #endif
971 , approx_pow2(approx_log2(x) * y));
972 }
973
974 SI F from_half(U16 h) {
975 #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
976 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
977 return vcvt_f32_f16(h);
978
979 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
980 return _mm256_cvtph_ps(h);
981
982 #else
983 // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
984 U32 sem = expand(h),
985 s = sem & 0x8000,
986 em = sem ^ s;
987
988 // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
989 auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here.
990 return if_then_else(denorm, F(0)
991 , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
992 #endif
993 }
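// Worked example for the portable path above: h = 0x3C00 (half 1.0) gives s = 0 and
// em = 0x3C00, which is not denormal, so the result bits are
// (0x3C00<<13) + ((127-15)<<23) = 0x3F800000, i.e. 1.0f.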
994
995 SI U16 to_half(F f) {
996 #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
997 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
998 return vcvt_f16_f32(f);
999
1000 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
1001 return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1002
1003 #else
1004 // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1005 U32 sem = bit_cast<U32>(f),
1006 s = sem & 0x80000000,
1007 em = sem ^ s;
1008
1009 // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1010 auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here.
1011 return pack(if_then_else(denorm, U32(0)
1012 , (s>>16) + (em>>13) - ((127-15)<<10)));
1013 #endif
1014 }
1015
1016 // Our fundamental vector depth is our pixel stride.
1017 static const size_t N = sizeof(F) / sizeof(float);
1018
1019 // We're finally going to get to what a Stage function looks like!
1020 // tail == 0 ~~> work on a full N pixels
1021 // tail != 0 ~~> work on only the first tail pixels
1022 // tail is always < N.
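// For example, on the HSW (AVX2) path F is 8 floats wide, so N == 8; a 10-pixel-wide row
// runs one full batch with tail == 0 followed by a final batch with tail == 2.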
1023
1024 // Any custom ABI to use for all (non-externally-facing) stage functions?
1025 // Also decide here whether to use narrow (compromise) or wide (ideal) stages.
1026 #if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1027 // This lets us pass vectors more efficiently on 32-bit ARM.
1028 // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
1029 #define ABI __attribute__((pcs("aapcs-vfp")))
1030 #define JUMPER_NARROW_STAGES 1
1031 #elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
1032 // SysV ABI makes it very sensible to use wide stages with clang-cl.
1033 // TODO: crashes during compilation :(
1034 #define ABI __attribute__((sysv_abi))
1035 #define JUMPER_NARROW_STAGES 0
1036 #elif defined(_MSC_VER)
1037 // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
1038 // instead of {b,a} on the stack. Narrow stages work best for __vectorcall.
1039 #define ABI __vectorcall
1040 #define JUMPER_NARROW_STAGES 1
1041 #elif defined(__x86_64__) || defined(SK_CPU_ARM64)
1042 // These platforms are ideal for wider stages, and their default ABI is ideal.
1043 #define ABI
1044 #define JUMPER_NARROW_STAGES 0
1045 #else
1046 // 32-bit or unknown... shunt them down the narrow path.
1047 // Odds are these have few registers and are better off there.
1048 #define ABI
1049 #define JUMPER_NARROW_STAGES 1
1050 #endif
1051
1052 #if JUMPER_NARROW_STAGES
1053 struct Params {
1054 size_t dx, dy, tail;
1055 F dr,dg,db,da;
1056 };
1057 using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
1058 #else
1059 // We keep program the second argument, so that it's passed in rsi for load_and_inc().
1060 using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
1061 #endif
1062
1063
1064 static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) {
1065 auto start = (Stage)load_and_inc(program);
1066 const size_t x0 = dx;
1067 for (; dy < ylimit; dy++) {
1068 #if JUMPER_NARROW_STAGES
1069 Params params = { x0,dy,0, 0,0,0,0 };
1070 while (params.dx + N <= xlimit) {
1071 start(&params,program, 0,0,0,0);
1072 params.dx += N;
1073 }
1074 if (size_t tail = xlimit - params.dx) {
1075 params.tail = tail;
1076 start(&params,program, 0,0,0,0);
1077 }
1078 #else
1079 dx = x0;
1080 while (dx + N <= xlimit) {
1081 start(0,program,dx,dy, 0,0,0,0, 0,0,0,0);
1082 dx += N;
1083 }
1084 if (size_t tail = xlimit - dx) {
1085 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
1086 }
1087 #endif
1088 }
1089 }
1090
1091 #if JUMPER_NARROW_STAGES
1092 #define STAGE(name, ...) \
1093 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1094 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1095 static void ABI name(Params* params, void** program, \
1096 F r, F g, F b, F a) { \
1097 name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
1098 params->dr, params->dg, params->db, params->da); \
1099 auto next = (Stage)load_and_inc(program); \
1100 next(params,program, r,g,b,a); \
1101 } \
1102 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1103 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1104 #else
1105 #define STAGE(name, ...) \
1106 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1107 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1108 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
1109 F r, F g, F b, F a, F dr, F dg, F db, F da) { \
1110 name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
1111 auto next = (Stage)load_and_inc(program); \
1112 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
1113 } \
1114 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1115 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1116 #endif
1117
1118
1119 // just_return() is a simple no-op stage that only exists to end the chain,
1120 // returning back up to start_pipeline(), and from there to the caller.
1121 #if JUMPER_NARROW_STAGES
1122 static void ABI just_return(Params*, void**, F,F,F,F) {}
1123 #else
1124 static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
1125 #endif
1126
1127
1128 // We could start defining normal Stages now. But first, some helper functions.
1129
1130 // These load() and store() methods are tail-aware,
1131 // but focus mainly on keeping the at-stride tail==0 case fast.
1132
1133 template <typename V, typename T>
1134 SI V load(const T* src, size_t tail) {
1135 #if !defined(JUMPER_IS_SCALAR)
1136 __builtin_assume(tail < N);
1137 if (__builtin_expect(tail, 0)) {
1138 V v{}; // Any inactive lanes are zeroed.
1139 switch (tail) {
1140 case 7: v[6] = src[6];
1141 case 6: v[5] = src[5];
1142 case 5: v[4] = src[4];
1143 case 4: memcpy(&v, src, 4*sizeof(T)); break;
1144 case 3: v[2] = src[2];
1145 case 2: memcpy(&v, src, 2*sizeof(T)); break;
1146 case 1: memcpy(&v, src, 1*sizeof(T)); break;
1147 }
1148 return v;
1149 }
1150 #endif
1151 return sk_unaligned_load<V>(src);
1152 }
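// Note the deliberate switch fall-through above: with tail == 3, case 3 copies lane 2 and
// then falls into case 2's memcpy of the first two lanes, so exactly 3 values are read.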
1153
1154 template <typename V, typename T>
1155 SI void store(T* dst, V v, size_t tail) {
1156 #if !defined(JUMPER_IS_SCALAR)
1157 __builtin_assume(tail < N);
1158 if (__builtin_expect(tail, 0)) {
1159 switch (tail) {
1160 case 7: dst[6] = v[6];
1161 case 6: dst[5] = v[5];
1162 case 5: dst[4] = v[4];
1163 case 4: memcpy(dst, &v, 4*sizeof(T)); break;
1164 case 3: dst[2] = v[2];
1165 case 2: memcpy(dst, &v, 2*sizeof(T)); break;
1166 case 1: memcpy(dst, &v, 1*sizeof(T)); break;
1167 }
1168 return;
1169 }
1170 #endif
1171 sk_unaligned_store(dst, v);
1172 }
1173
1174 SI F from_byte(U8 b) {
1175 return cast(expand(b)) * (1/255.0f);
1176 }
1177 SI F from_short(U16 s) {
1178 return cast(expand(s)) * (1/65535.0f);
1179 }
1180 SI void from_565(U16 _565, F* r, F* g, F* b) {
1181 U32 wide = expand(_565);
1182 *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
1183 *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1184 *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1185 }
1186 SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1187 U32 wide = expand(_4444);
1188 *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
1189 *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1190 *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1191 *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1192 }
1193 SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1194 *r = cast((_8888 ) & 0xff) * (1/255.0f);
1195 *g = cast((_8888 >> 8) & 0xff) * (1/255.0f);
1196 *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
1197 *a = cast((_8888 >> 24) ) * (1/255.0f);
1198 }
1199 SI void from_88(U16 _88, F* r, F* g) {
1200 U32 wide = expand(_88);
1201 *r = cast((wide ) & 0xff) * (1/255.0f);
1202 *g = cast((wide >> 8) & 0xff) * (1/255.0f);
1203 }
1204 SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1205 *r = cast((rgba ) & 0x3ff) * (1/1023.0f);
1206 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f);
1207 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f);
1208 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1209 }
1210 SI void from_1616(U32 _1616, F* r, F* g) {
1211 *r = cast((_1616 ) & 0xffff) * (1/65535.0f);
1212 *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1213 }
1214 SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1215 *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1216 *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1217 *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1218 *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1219 }
1220
1221 // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1222 template <typename T>
1223 SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1224 return (T*)ctx->pixels + dy*ctx->stride + dx;
1225 }
1226
1227 // clamp v to [0,limit).
1228 SI F clamp(F v, F limit) {
1229 F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
1230 return min(max(0, v), inclusive);
1231 }
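// For example, with limit == 8.0f the bit trick above yields the largest float below 8.0f,
// so trunc_() of a clamped coordinate is at most 7, keeping gathers inside an 8-pixel-wide
// image.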
1232
1233 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1234 template <typename T>
1235 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1236 x = clamp(x, ctx->width);
1237 y = clamp(y, ctx->height);
1238
1239 *ptr = (const T*)ctx->pixels;
1240 return trunc_(y)*ctx->stride + trunc_(x);
1241 }
1242
1243 // We often have a nominally [0,1] float value we need to scale and convert to an integer,
1244 // whether for a table lookup or to pack back down into bytes for storage.
1245 //
1246 // In practice, especially when dealing with interesting color spaces, that notionally
1247 // [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
1248 //
1249 // You can adjust the expected input to [0,bias] by tweaking that parameter.
1250 SI U32 to_unorm(F v, F scale, F bias = 1.0f) {
1251 // TODO: platform-specific implementations to to_unorm(), removing round() entirely?
1252 // Any time we use round() we probably want to use to_unorm().
1253 return round(min(max(0, v), bias), scale);
1254 }
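// For example, to_unorm(v, 255.0f) maps 0.0f to 0 and 1.0f to 255, and an out-of-range
// value like 1.25f is first clamped to the bias (1.0f) and so also stores as 255.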
1255
1256 SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); }
1257
1258 // Now finally, normal Stages!
1259
1260 STAGE(seed_shader, Ctx::None) {
1261 static const float iota[] = {
1262 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
1263 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
1264 };
1265 // It's important for speed to explicitly cast(dx) and cast(dy),
1266 // which has the effect of splatting them to vectors before converting to floats.
1267 // On Intel this breaks a data dependency on previous loop iterations' registers.
1268 r = cast(dx) + sk_unaligned_load<F>(iota);
1269 g = cast(dy) + 0.5f;
1270 b = 1.0f;
1271 a = 0;
1272 dr = dg = db = da = 0;
1273 }
1274
1275 STAGE(dither, const float* rate) {
1276 // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
1277 uint32_t iota[] = {0,1,2,3,4,5,6,7};
1278 U32 X = dx + sk_unaligned_load<U32>(iota),
1279 Y = dy;
1280
1281 // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
1282 // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
1283
1284 // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
1285 Y ^= X;
1286
1287 // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
1288 // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
1289 U32 M = (Y & 1) << 5 | (X & 1) << 4
1290 | (Y & 2) << 2 | (X & 2) << 1
1291 | (Y & 4) >> 1 | (X & 4) >> 2;
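// Worked example of the bit mixing: X = 0b111 (abc = 111) and Y = 0b000 (def = 000)
// interleave to fcebda = 0b010101, i.e. M = 21.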
1292
1293 // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
1294 // We want to make sure our dither is less than 0.5 in either direction to keep exact values
1295 // like 0 and 1 unchanged after rounding.
1296 F dither = cast(M) * (2/128.0f) - (63/128.0f);
1297
1298 r += *rate*dither;
1299 g += *rate*dither;
1300 b += *rate*dither;
1301
1302 r = max(0, min(r, a));
1303 g = max(0, min(g, a));
1304 b = max(0, min(b, a));
1305 }
1306
1307 // load 4 floats from memory, and splat them into r,g,b,a
1308 STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1309 r = c->r;
1310 g = c->g;
1311 b = c->b;
1312 a = c->a;
1313 }
1314 STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1315 r = c->r;
1316 g = c->g;
1317 b = c->b;
1318 a = c->a;
1319 }
1320 // load 4 floats from memory, and splat them into dr,dg,db,da
1321 STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
1322 dr = c->r;
1323 dg = c->g;
1324 db = c->b;
1325 da = c->a;
1326 }
1327
1328 // splats opaque-black into r,g,b,a
1329 STAGE(black_color, Ctx::None) {
1330 r = g = b = 0.0f;
1331 a = 1.0f;
1332 }
1333
1334 STAGE(white_color, Ctx::None) {
1335 r = g = b = a = 1.0f;
1336 }
1337
1338 // load registers r,g,b,a from context (mirrors store_rgba)
1339 STAGE(load_src, const float* ptr) {
1340 r = sk_unaligned_load<F>(ptr + 0*N);
1341 g = sk_unaligned_load<F>(ptr + 1*N);
1342 b = sk_unaligned_load<F>(ptr + 2*N);
1343 a = sk_unaligned_load<F>(ptr + 3*N);
1344 }
1345
1346 // store registers r,g,b,a into context (mirrors load_rgba)
1347 STAGE(store_src, float* ptr) {
1348 sk_unaligned_store(ptr + 0*N, r);
1349 sk_unaligned_store(ptr + 1*N, g);
1350 sk_unaligned_store(ptr + 2*N, b);
1351 sk_unaligned_store(ptr + 3*N, a);
1352 }
1353
1354 // load registers dr,dg,db,da from context (mirrors store_dst)
1355 STAGE(load_dst, const float* ptr) {
1356 dr = sk_unaligned_load<F>(ptr + 0*N);
1357 dg = sk_unaligned_load<F>(ptr + 1*N);
1358 db = sk_unaligned_load<F>(ptr + 2*N);
1359 da = sk_unaligned_load<F>(ptr + 3*N);
1360 }
1361
1362 // store registers dr,dg,db,da into context (mirrors load_dst)
1363 STAGE(store_dst, float* ptr) {
1364 sk_unaligned_store(ptr + 0*N, dr);
1365 sk_unaligned_store(ptr + 1*N, dg);
1366 sk_unaligned_store(ptr + 2*N, db);
1367 sk_unaligned_store(ptr + 3*N, da);
1368 }
1369
1370 // Most blend modes apply the same logic to each channel.
1371 #define BLEND_MODE(name) \
1372 SI F name##_channel(F s, F d, F sa, F da); \
1373 STAGE(name, Ctx::None) { \
1374 r = name##_channel(r,dr,a,da); \
1375 g = name##_channel(g,dg,a,da); \
1376 b = name##_channel(b,db,a,da); \
1377 a = name##_channel(a,da,a,da); \
1378 } \
1379 SI F name##_channel(F s, F d, F sa, F da)
1380
1381 SI F inv(F x) { return 1.0f - x; }
1382 SI F two(F x) { return x + x; }
1383
1384
1385 BLEND_MODE(clear) { return 0; }
1386 BLEND_MODE(srcatop) { return s*da + d*inv(sa); }
1387 BLEND_MODE(dstatop) { return d*sa + s*inv(da); }
1388 BLEND_MODE(srcin) { return s * da; }
1389 BLEND_MODE(dstin) { return d * sa; }
1390 BLEND_MODE(srcout) { return s * inv(da); }
1391 BLEND_MODE(dstout) { return d * inv(sa); }
1392 BLEND_MODE(srcover) { return mad(d, inv(sa), s); }
1393 BLEND_MODE(dstover) { return mad(s, inv(da), d); }
1394
1395 BLEND_MODE(modulate) { return s*d; }
1396 BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
1397 BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa.
1398 BLEND_MODE(screen) { return s + d - s*d; }
1399 BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); }
1400 #undef BLEND_MODE
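// As an illustration, the macro above expands srcover into a stage whose every channel
// computes srcover_channel(s,d,sa,da) = mad(d, inv(sa), s) = s + d*(1-sa), the usual
// premultiplied source-over equation.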
1401
1402 // Most other blend modes apply the same logic to colors, and srcover to alpha.
1403 #define BLEND_MODE(name) \
1404 SI F name##_channel(F s, F d, F sa, F da); \
1405 STAGE(name, Ctx::None) { \
1406 r = name##_channel(r,dr,a,da); \
1407 g = name##_channel(g,dg,a,da); \
1408 b = name##_channel(b,db,a,da); \
1409 a = mad(da, inv(a), a); \
1410 } \
1411 SI F name##_channel(F s, F d, F sa, F da)
1412
1413 BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; }
1414 BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; }
1415 BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
1416 BLEND_MODE(exclusion) { return s + d - two(s*d); }
1417
1418 BLEND_MODE(colorburn) {
1419 return if_then_else(d == da, d + s*inv(da),
1420 if_then_else(s == 0, /* s + */ d*inv(sa),
1421 sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa)));
1422 }
1423 BLEND_MODE(colordodge) {
1424 return if_then_else(d == 0, /* d + */ s*inv(da),
1425 if_then_else(s == sa, s + d*inv(sa),
1426 sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa)));
1427 }
1428 BLEND_MODE(hardlight) {
1429 return s*inv(da) + d*inv(sa)
1430 + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
1431 }
1432 BLEND_MODE(overlay) {
1433 return s*inv(da) + d*inv(sa)
1434 + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
1435 }
1436
1437 BLEND_MODE(softlight) {
1438 F m = if_then_else(da > 0, d / da, 0),
1439 s2 = two(s),
1440 m4 = two(two(m));
1441
1442 // The logic forks three ways:
1443 // 1. dark src?
1444 // 2. light src, dark dst?
1445 // 3. light src, light dst?
1446 F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1.
1447 darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2.
1448 liteDst = rcp(rsqrt(m)) - m, // Used in case 3.
1449 liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
1450 return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
1451 }
1452 #undef BLEND_MODE
1453
1454 // We're basing our implementation of non-separable blend modes on
1455 // https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1456 // and
1457 // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1458 // They're equivalent, but ES' math has been better simplified.
1459 //
1460 // Anything extra we add beyond that is to make the math work with premul inputs.
1461
1462 SI F max(F r, F g, F b) { return max(r, max(g, b)); }
1463 SI F min(F r, F g, F b) { return min(r, min(g, b)); }
1464
1465 SI F sat(F r, F g, F b) { return max(r,g,b) - min(r,g,b); }
1466 SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; }
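// For example, lum(1,1,1) == 0.30f + 0.59f + 0.11f == 1.0f, and sat(0.2f, 0.5f, 0.9f) == 0.7f.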
1467
1468 SI void set_sat(F* r, F* g, F* b, F s) {
1469 F mn = min(*r,*g,*b),
1470 mx = max(*r,*g,*b),
1471 sat = mx - mn;
1472
1473 // Map min channel to 0, max channel to s, and scale the middle proportionally.
1474 auto scale = [=](F c) {
1475 return if_then_else(sat == 0, 0, (c - mn) * s / sat);
1476 };
1477 *r = scale(*r);
1478 *g = scale(*g);
1479 *b = scale(*b);
1480 }
1481 SI void set_lum(F* r, F* g, F* b, F l) {
1482 F diff = l - lum(*r, *g, *b);
1483 *r += diff;
1484 *g += diff;
1485 *b += diff;
1486 }
1487 SI void clip_color(F* r, F* g, F* b, F a) {
1488 F mn = min(*r, *g, *b),
1489 mx = max(*r, *g, *b),
1490 l = lum(*r, *g, *b);
1491
1492 auto clip = [=](F c) {
1493 c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) );
1494 c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c);
1495 c = max(c, 0); // Sometimes without this we may dip just a little negative.
1496 return c;
1497 };
1498 *r = clip(*r);
1499 *g = clip(*g);
1500 *b = clip(*b);
1501 }
1502
1503 STAGE(hue, Ctx::None) {
1504 F R = r*a,
1505 G = g*a,
1506 B = b*a;
1507
1508 set_sat(&R, &G, &B, sat(dr,dg,db)*a);
1509 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1510 clip_color(&R,&G,&B, a*da);
1511
1512 r = r*inv(da) + dr*inv(a) + R;
1513 g = g*inv(da) + dg*inv(a) + G;
1514 b = b*inv(da) + db*inv(a) + B;
1515 a = a + da - a*da;
1516 }
1517 STAGE(saturation, Ctx::None) {
1518 F R = dr*a,
1519 G = dg*a,
1520 B = db*a;
1521
1522 set_sat(&R, &G, &B, sat( r, g, b)*da);
1523 set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.)
1524 clip_color(&R,&G,&B, a*da);
1525
1526 r = r*inv(da) + dr*inv(a) + R;
1527 g = g*inv(da) + dg*inv(a) + G;
1528 b = b*inv(da) + db*inv(a) + B;
1529 a = a + da - a*da;
1530 }
STAGE(color,Ctx::None)1531 STAGE(color, Ctx::None) {
1532 F R = r*da,
1533 G = g*da,
1534 B = b*da;
1535
1536 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1537 clip_color(&R,&G,&B, a*da);
1538
1539 r = r*inv(da) + dr*inv(a) + R;
1540 g = g*inv(da) + dg*inv(a) + G;
1541 b = b*inv(da) + db*inv(a) + B;
1542 a = a + da - a*da;
1543 }
STAGE(luminosity,Ctx::None)1544 STAGE(luminosity, Ctx::None) {
1545 F R = dr*a,
1546 G = dg*a,
1547 B = db*a;
1548
1549 set_lum(&R, &G, &B, lum(r,g,b)*da);
1550 clip_color(&R,&G,&B, a*da);
1551
1552 r = r*inv(da) + dr*inv(a) + R;
1553 g = g*inv(da) + dg*inv(a) + G;
1554 b = b*inv(da) + db*inv(a) + B;
1555 a = a + da - a*da;
1556 }
1557
STAGE(srcover_rgba_8888,const SkRasterPipeline_MemoryCtx * ctx)1558 STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1559 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1560
1561 U32 dst = load<U32>(ptr, tail);
1562 dr = cast((dst ) & 0xff);
1563 dg = cast((dst >> 8) & 0xff);
1564 db = cast((dst >> 16) & 0xff);
1565 da = cast((dst >> 24) );
1566 // {dr,dg,db,da} are in [0,255]
1567 // { r, g, b, a} are in [0, 1] (but may be out of gamut)
1568
1569 r = mad(dr, inv(a), r*255.0f);
1570 g = mad(dg, inv(a), g*255.0f);
1571 b = mad(db, inv(a), b*255.0f);
1572 a = mad(da, inv(a), a*255.0f);
1573 // { r, g, b, a} are now in [0,255] (but may be out of gamut)
1574
1575 // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
1576 dst = to_unorm(r, 1, 255)
1577 | to_unorm(g, 1, 255) << 8
1578 | to_unorm(b, 1, 255) << 16
1579 | to_unorm(a, 1, 255) << 24;
1580 store(ptr, dst, tail);
1581 }
1582
STAGE(clamp_0,Ctx::None)1583 STAGE(clamp_0, Ctx::None) {
1584 r = max(r, 0);
1585 g = max(g, 0);
1586 b = max(b, 0);
1587 a = max(a, 0);
1588 }
1589
STAGE(clamp_1,Ctx::None)1590 STAGE(clamp_1, Ctx::None) {
1591 r = min(r, 1.0f);
1592 g = min(g, 1.0f);
1593 b = min(b, 1.0f);
1594 a = min(a, 1.0f);
1595 }
1596
STAGE(clamp_a,Ctx::None)1597 STAGE(clamp_a, Ctx::None) {
1598 a = min(a, 1.0f);
1599 r = min(r, a);
1600 g = min(g, a);
1601 b = min(b, a);
1602 }
1603
STAGE(clamp_gamut,Ctx::None)1604 STAGE(clamp_gamut, Ctx::None) {
1605 // If you're using this stage, a should already be in [0,1].
1606 r = min(max(r, 0), a);
1607 g = min(max(g, 0), a);
1608 b = min(max(b, 0), a);
1609 }
1610
STAGE(set_rgb,const float * rgb)1611 STAGE(set_rgb, const float* rgb) {
1612 r = rgb[0];
1613 g = rgb[1];
1614 b = rgb[2];
1615 }
STAGE(unbounded_set_rgb,const float * rgb)1616 STAGE(unbounded_set_rgb, const float* rgb) {
1617 r = rgb[0];
1618 g = rgb[1];
1619 b = rgb[2];
1620 }
1621
STAGE(swap_rb,Ctx::None)1622 STAGE(swap_rb, Ctx::None) {
1623 auto tmp = r;
1624 r = b;
1625 b = tmp;
1626 }
STAGE(swap_rb_dst,Ctx::None)1627 STAGE(swap_rb_dst, Ctx::None) {
1628 auto tmp = dr;
1629 dr = db;
1630 db = tmp;
1631 }
1632
STAGE(move_src_dst,Ctx::None)1633 STAGE(move_src_dst, Ctx::None) {
1634 dr = r;
1635 dg = g;
1636 db = b;
1637 da = a;
1638 }
STAGE(move_dst_src,Ctx::None)1639 STAGE(move_dst_src, Ctx::None) {
1640 r = dr;
1641 g = dg;
1642 b = db;
1643 a = da;
1644 }
1645
STAGE(premul,Ctx::None)1646 STAGE(premul, Ctx::None) {
1647 r = r * a;
1648 g = g * a;
1649 b = b * a;
1650 }
STAGE(premul_dst,Ctx::None)1651 STAGE(premul_dst, Ctx::None) {
1652 dr = dr * da;
1653 dg = dg * da;
1654 db = db * da;
1655 }
STAGE(unpremul,Ctx::None)1656 STAGE(unpremul, Ctx::None) {
1657 float inf = bit_cast<float>(0x7f800000);
1658 auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0);
1659 r *= scale;
1660 g *= scale;
1661 b *= scale;
1662 }
1663
STAGE(force_opaque,Ctx::None)1664 STAGE(force_opaque , Ctx::None) { a = 1; }
STAGE(force_opaque_dst,Ctx::None)1665 STAGE(force_opaque_dst, Ctx::None) { da = 1; }
1666
STAGE(rgb_to_hsl,Ctx::None)1667 STAGE(rgb_to_hsl, Ctx::None) {
1668 F mx = max(r,g,b),
1669 mn = min(r,g,b),
1670 d = mx - mn,
1671 d_rcp = 1.0f / d;
1672
1673 F h = (1/6.0f) *
1674 if_then_else(mx == mn, 0,
1675 if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0),
1676 if_then_else(mx == g, (b-r)*d_rcp + 2.0f,
1677 (r-g)*d_rcp + 4.0f)));
1678
1679 F l = (mx + mn) * 0.5f;
1680 F s = if_then_else(mx == mn, 0,
1681 d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));
1682
1683 r = h;
1684 g = s;
1685 b = l;
1686 }
STAGE(hsl_to_rgb,Ctx::None)1687 STAGE(hsl_to_rgb, Ctx::None) {
1688 F h = r,
1689 s = g,
1690 l = b;
1691
1692 F q = l + if_then_else(l >= 0.5f, s - l*s, l*s),
1693 p = 2.0f*l - q;
1694
1695 auto hue_to_rgb = [&](F t) {
1696 t = fract(t);
1697
1698 F r = p;
1699 r = if_then_else(t >= 4/6.0f, r, p + (q-p)*(4.0f - 6.0f*t));
1700 r = if_then_else(t >= 3/6.0f, r, q);
1701 r = if_then_else(t >= 1/6.0f, r, p + (q-p)*( 6.0f*t));
1702 return r;
1703 };
1704
1705 r = if_then_else(s == 0, l, hue_to_rgb(h + (1/3.0f)));
1706 g = if_then_else(s == 0, l, hue_to_rgb(h ));
1707 b = if_then_else(s == 0, l, hue_to_rgb(h - (1/3.0f)));
1708 }
1709
1710 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
alpha_coverage_from_rgb_coverage(F a,F da,F cr,F cg,F cb)1711 SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
1712 return if_then_else(a < da, min(cr,cg,cb)
1713 , max(cr,cg,cb));
1714 }
1715
STAGE(scale_1_float,const float * c)1716 STAGE(scale_1_float, const float* c) {
1717 r = r * *c;
1718 g = g * *c;
1719 b = b * *c;
1720 a = a * *c;
1721 }
STAGE(scale_u8,const SkRasterPipeline_MemoryCtx * ctx)1722 STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1723 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1724
1725 auto scales = load<U8>(ptr, tail);
1726 auto c = from_byte(scales);
1727
1728 r = r * c;
1729 g = g * c;
1730 b = b * c;
1731 a = a * c;
1732 }
STAGE(scale_565,const SkRasterPipeline_MemoryCtx * ctx)1733 STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
1734 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1735
1736 F cr,cg,cb;
1737 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1738
1739 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1740
1741 r = r * cr;
1742 g = g * cg;
1743 b = b * cb;
1744 a = a * ca;
1745 }
1746
lerp(F from,F to,F t)1747 SI F lerp(F from, F to, F t) {
1748 return mad(to-from, t, from);
1749 }
1750
STAGE(lerp_1_float,const float * c)1751 STAGE(lerp_1_float, const float* c) {
1752 r = lerp(dr, r, *c);
1753 g = lerp(dg, g, *c);
1754 b = lerp(db, b, *c);
1755 a = lerp(da, a, *c);
1756 }
STAGE(lerp_native,const float scales[])1757 STAGE(lerp_native, const float scales[]) {
1758 auto c = sk_unaligned_load<F>(scales);
1759 r = lerp(dr, r, c);
1760 g = lerp(dg, g, c);
1761 b = lerp(db, b, c);
1762 a = lerp(da, a, c);
1763 }
STAGE(lerp_u8,const SkRasterPipeline_MemoryCtx * ctx)1764 STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1765 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1766
1767 auto scales = load<U8>(ptr, tail);
1768 auto c = from_byte(scales);
1769
1770 r = lerp(dr, r, c);
1771 g = lerp(dg, g, c);
1772 b = lerp(db, b, c);
1773 a = lerp(da, a, c);
1774 }
STAGE(lerp_565,const SkRasterPipeline_MemoryCtx * ctx)1775 STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
1776 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1777
1778 F cr,cg,cb;
1779 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1780
1781 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1782
1783 r = lerp(dr, r, cr);
1784 g = lerp(dg, g, cg);
1785 b = lerp(db, b, cb);
1786 a = lerp(da, a, ca);
1787 }
1788
STAGE(emboss,const SkRasterPipeline_EmbossCtx * ctx)1789 STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
1790 auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy),
1791 aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy);
1792
1793 F mul = from_byte(load<U8>(mptr, tail)),
1794 add = from_byte(load<U8>(aptr, tail));
1795
1796 r = mad(r, mul, add);
1797 g = mad(g, mul, add);
1798 b = mad(b, mul, add);
1799 }
1800
STAGE(byte_tables,const void * ctx)1801 STAGE(byte_tables, const void* ctx) { // TODO: rename Tables SkRasterPipeline_ByteTablesCtx
1802 struct Tables { const uint8_t *r, *g, *b, *a; };
1803 auto tables = (const Tables*)ctx;
1804
1805 r = from_byte(gather(tables->r, to_unorm(r, 255)));
1806 g = from_byte(gather(tables->g, to_unorm(g, 255)));
1807 b = from_byte(gather(tables->b, to_unorm(b, 255)));
1808 a = from_byte(gather(tables->a, to_unorm(a, 255)));
1809 }
1810
strip_sign(F x,U32 * sign)1811 SI F strip_sign(F x, U32* sign) {
1812 U32 bits = bit_cast<U32>(x);
1813 *sign = bits & 0x80000000;
1814 return bit_cast<F>(bits ^ *sign);
1815 }
1816
apply_sign(F x,U32 sign)1817 SI F apply_sign(F x, U32 sign) {
1818 return bit_cast<F>(sign | bit_cast<U32>(x));
1819 }
1820
STAGE(parametric,const skcms_TransferFunction * ctx)1821 STAGE(parametric, const skcms_TransferFunction* ctx) {
1822 auto fn = [&](F v) {
1823 U32 sign;
1824 v = strip_sign(v, &sign);
1825
1826 F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f)
1827 , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e);
1828 return apply_sign(r, sign);
1829 };
1830 r = fn(r);
1831 g = fn(g);
1832 b = fn(b);
1833 }
1834
STAGE(gamma_,const float * G)1835 STAGE(gamma_, const float* G) {
1836 auto fn = [&](F v) {
1837 U32 sign;
1838 v = strip_sign(v, &sign);
1839 return apply_sign(approx_powf(v, *G), sign);
1840 };
1841 r = fn(r);
1842 g = fn(g);
1843 b = fn(b);
1844 }
1845
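// For reference (not used directly by the code), the exact sRGB transfer functions that the two
// stages below approximate are, per the sRGB spec:
//     decode: f(s) = s/12.92                   for s <= 0.04045
//                  = ((s + 0.055)/1.055)^2.4   otherwise
//     encode: g(l) = 12.92*l                   for l <= 0.0031308
//                  = 1.055*l^(1/2.4) - 0.055   otherwise
// from_srgb approximates the curve with a small polynomial in s; to_srgb uses a rational
// approximation in t = rsqrt(l), with constants tuned per instruction set (see below).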
STAGE(from_srgb,Ctx::None)1846 STAGE(from_srgb, Ctx::None) {
1847 auto fn = [](F s) {
1848 U32 sign;
1849 s = strip_sign(s, &sign);
1850 auto lo = s * (1/12.92f);
1851 auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f);
1852 return apply_sign(if_then_else(s < 0.055f, lo, hi), sign);
1853 };
1854 r = fn(r);
1855 g = fn(g);
1856 b = fn(b);
1857 }
STAGE(to_srgb,Ctx::None)1858 STAGE(to_srgb, Ctx::None) {
1859 auto fn = [](F l) {
1860 U32 sign;
1861 l = strip_sign(l, &sign);
1862 // We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
1863 #if defined(JUMPER_IS_AVX512)
1864 const float c = 1.130026340485f,
1865 d = 0.141387879848f;
1866 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
1867 defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW )
1868 const float c = 1.130048394203f,
1869 d = 0.141357362270f;
1870 #elif defined(JUMPER_IS_NEON)
1871 const float c = 1.129999995232f,
1872 d = 0.141381442547f;
1873 #else
1874 const float c = 1.129999995232f,
1875 d = 0.141377761960f;
1876 #endif
1877 F t = rsqrt(l);
1878 auto lo = l * 12.92f;
1879 auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), c)
1880 * rcp(d + t);
1881 return apply_sign(if_then_else(l < 0.00465985f, lo, hi), sign);
1882 };
1883 r = fn(r);
1884 g = fn(g);
1885 b = fn(b);
1886 }
1887
STAGE(load_a8,const SkRasterPipeline_MemoryCtx * ctx)1888 STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1889 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1890
1891 r = g = b = 0.0f;
1892 a = from_byte(load<U8>(ptr, tail));
1893 }
STAGE(load_a8_dst,const SkRasterPipeline_MemoryCtx * ctx)1894 STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1895 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1896
1897 dr = dg = db = 0.0f;
1898 da = from_byte(load<U8>(ptr, tail));
1899 }
STAGE(gather_a8,const SkRasterPipeline_GatherCtx * ctx)1900 STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
1901 const uint8_t* ptr;
1902 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1903 r = g = b = 0.0f;
1904 a = from_byte(gather(ptr, ix));
1905 }
STAGE(store_a8,const SkRasterPipeline_MemoryCtx * ctx)1906 STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1907 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
1908
1909 U8 packed = pack(pack(to_unorm(a, 255)));
1910 store(ptr, packed, tail);
1911 }
1912
STAGE(load_565,const SkRasterPipeline_MemoryCtx * ctx)1913 STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
1914 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1915
1916 from_565(load<U16>(ptr, tail), &r,&g,&b);
1917 a = 1.0f;
1918 }
STAGE(load_565_dst,const SkRasterPipeline_MemoryCtx * ctx)1919 STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1920 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1921
1922 from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1923 da = 1.0f;
1924 }
STAGE(gather_565,const SkRasterPipeline_GatherCtx * ctx)1925 STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
1926 const uint16_t* ptr;
1927 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1928 from_565(gather(ptr, ix), &r,&g,&b);
1929 a = 1.0f;
1930 }
STAGE(store_565,const SkRasterPipeline_MemoryCtx * ctx)1931 STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
1932 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1933
1934 U16 px = pack( to_unorm(r, 31) << 11
1935 | to_unorm(g, 63) << 5
1936 | to_unorm(b, 31) );
1937 store(ptr, px, tail);
1938 }
1939
STAGE(load_4444,const SkRasterPipeline_MemoryCtx * ctx)1940 STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
1941 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1942 from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
1943 }
STAGE(load_4444_dst,const SkRasterPipeline_MemoryCtx * ctx)1944 STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1945 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1946 from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
1947 }
STAGE(gather_4444,const SkRasterPipeline_GatherCtx * ctx)1948 STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
1949 const uint16_t* ptr;
1950 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1951 from_4444(gather(ptr, ix), &r,&g,&b,&a);
1952 }
STAGE(store_4444,const SkRasterPipeline_MemoryCtx * ctx)1953 STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
1954 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1955 U16 px = pack( to_unorm(r, 15) << 12
1956 | to_unorm(g, 15) << 8
1957 | to_unorm(b, 15) << 4
1958 | to_unorm(a, 15) );
1959 store(ptr, px, tail);
1960 }
1961
STAGE(load_8888,const SkRasterPipeline_MemoryCtx * ctx)1962 STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1963 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
1964 from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
1965 }
STAGE(load_8888_dst,const SkRasterPipeline_MemoryCtx * ctx)1966 STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1967 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
1968 from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
1969 }
STAGE(gather_8888,const SkRasterPipeline_GatherCtx * ctx)1970 STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
1971 const uint32_t* ptr;
1972 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1973 from_8888(gather(ptr, ix), &r,&g,&b,&a);
1974 }
STAGE(store_8888,const SkRasterPipeline_MemoryCtx * ctx)1975 STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1976 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1977
1978 U32 px = to_unorm(r, 255)
1979 | to_unorm(g, 255) << 8
1980 | to_unorm(b, 255) << 16
1981 | to_unorm(a, 255) << 24;
1982 store(ptr, px, tail);
1983 }
1984
STAGE(load_rg88,const SkRasterPipeline_MemoryCtx * ctx)1985 STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
1986 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1987 b = 0;
1988 a = 1;
1989 from_88(load<U16>(ptr, tail), &r,&g);
1990 }
STAGE(store_rg88,const SkRasterPipeline_MemoryCtx * ctx)1991 STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
1992 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1993
1994 U16 px = pack( to_unorm(r, 255)
1995 | to_unorm(g, 255) << 8);
1996 store(ptr, px, tail);
1997 }
1998
STAGE(load_a16,const SkRasterPipeline_MemoryCtx * ctx)1999 STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2000 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2001 r = g = b = 0;
2002 a = from_short(load<U16>(ptr, tail));
2003 }
STAGE(store_a16,const SkRasterPipeline_MemoryCtx * ctx)2004 STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2005 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2006
2007 U16 px = pack(to_unorm(a, 65535));
2008 store(ptr, px, tail);
2009 }
STAGE(load_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2010 STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2011 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2012 b = 0; a = 1;
2013 from_1616(load<U32>(ptr, tail), &r,&g);
2014 }
STAGE(store_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2015 STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2016 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2017
2018 U32 px = to_unorm(r, 65535)
2019 | to_unorm(g, 65535) << 16;
2020 store(ptr, px, tail);
2021 }
STAGE(load_16161616,const SkRasterPipeline_MemoryCtx * ctx)2022 STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2023 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2024 from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2025 }
STAGE(store_16161616,const SkRasterPipeline_MemoryCtx * ctx)2026 STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2027 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
2028
2029 U16 R = pack(to_unorm(r, 65535)),
2030 G = pack(to_unorm(g, 65535)),
2031 B = pack(to_unorm(b, 65535)),
2032 A = pack(to_unorm(a, 65535));
2033
2034 store4(ptr,tail, R,G,B,A);
2035 }
2036
2037
STAGE(load_1010102,const SkRasterPipeline_MemoryCtx * ctx)2038 STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2039 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2040 from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2041 }
STAGE(load_1010102_dst,const SkRasterPipeline_MemoryCtx * ctx)2042 STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2043 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2044 from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2045 }
STAGE(gather_1010102,const SkRasterPipeline_GatherCtx * ctx)2046 STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
2047 const uint32_t* ptr;
2048 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2049 from_1010102(gather(ptr, ix), &r,&g,&b,&a);
2050 }
STAGE(store_1010102,const SkRasterPipeline_MemoryCtx * ctx)2051 STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2052 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2053
2054 U32 px = to_unorm(r, 1023)
2055 | to_unorm(g, 1023) << 10
2056 | to_unorm(b, 1023) << 20
2057 | to_unorm(a, 3) << 30;
2058 store(ptr, px, tail);
2059 }
2060
STAGE(load_f16,const SkRasterPipeline_MemoryCtx * ctx)2061 STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2062 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2063
2064 U16 R,G,B,A;
2065 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2066 r = from_half(R);
2067 g = from_half(G);
2068 b = from_half(B);
2069 a = from_half(A);
2070 }
STAGE(load_f16_dst,const SkRasterPipeline_MemoryCtx * ctx)2071 STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2072 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2073
2074 U16 R,G,B,A;
2075 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2076 dr = from_half(R);
2077 dg = from_half(G);
2078 db = from_half(B);
2079 da = from_half(A);
2080 }
STAGE(gather_f16,const SkRasterPipeline_GatherCtx * ctx)2081 STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
2082 const uint64_t* ptr;
2083 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2084 auto px = gather(ptr, ix);
2085
2086 U16 R,G,B,A;
2087 load4((const uint16_t*)&px,0, &R,&G,&B,&A);
2088 r = from_half(R);
2089 g = from_half(G);
2090 b = from_half(B);
2091 a = from_half(A);
2092 }
STAGE(store_f16,const SkRasterPipeline_MemoryCtx * ctx)2093 STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2094 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
2095 store4((uint16_t*)ptr,tail, to_half(r)
2096 , to_half(g)
2097 , to_half(b)
2098 , to_half(a));
2099 }
2100
STAGE(store_u16_be,const SkRasterPipeline_MemoryCtx * ctx)2101 STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) {
2102 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);
2103
2104 U16 R = bswap(pack(to_unorm(r, 65535))),
2105 G = bswap(pack(to_unorm(g, 65535))),
2106 B = bswap(pack(to_unorm(b, 65535))),
2107 A = bswap(pack(to_unorm(a, 65535)));
2108
2109 store4(ptr,tail, R,G,B,A);
2110 }
2111
STAGE(load_af16,const SkRasterPipeline_MemoryCtx * ctx)2112 STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2113 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2114
2115 U16 A = load<U16>((const uint16_t*)ptr, tail);
2116 r = 0;
2117 g = 0;
2118 b = 0;
2119 a = from_half(A);
2120 }
STAGE(store_af16,const SkRasterPipeline_MemoryCtx * ctx)2121 STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2122 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2123 store(ptr, to_half(a), tail);
2124 }
2125
STAGE(load_rgf16,const SkRasterPipeline_MemoryCtx * ctx)2126 STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2127 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2128
2129 U16 R,G;
2130 load2((const uint16_t*)ptr,tail, &R,&G);
2131 r = from_half(R);
2132 g = from_half(G);
2133 b = 0;
2134 a = from_half(0x3C00); // one
2135 }
STAGE(store_rgf16,const SkRasterPipeline_MemoryCtx * ctx)2136 STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2137 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2138 store2((uint16_t*)ptr, tail, to_half(r)
2139 , to_half(g));
2140 }
2141
STAGE(load_f32,const SkRasterPipeline_MemoryCtx * ctx)2142 STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2143 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2144 load4(ptr,tail, &r,&g,&b,&a);
2145 }
STAGE(load_f32_dst,const SkRasterPipeline_MemoryCtx * ctx)2146 STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2147 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2148 load4(ptr,tail, &dr,&dg,&db,&da);
2149 }
STAGE(gather_f32,const SkRasterPipeline_GatherCtx * ctx)2150 STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
2151 const float* ptr;
2152 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2153 r = gather(ptr, 4*ix + 0);
2154 g = gather(ptr, 4*ix + 1);
2155 b = gather(ptr, 4*ix + 2);
2156 a = gather(ptr, 4*ix + 3);
2157 }
STAGE(store_f32,const SkRasterPipeline_MemoryCtx * ctx)2158 STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2159 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
2160 store4(ptr,tail, r,g,b,a);
2161 }
2162
STAGE(load_rgf32,const SkRasterPipeline_MemoryCtx * ctx)2163 STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2164 auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy);
2165 load2(ptr, tail, &r, &g);
2166 b = 0;
2167 a = 1;
2168 }
STAGE(store_rgf32,const SkRasterPipeline_MemoryCtx * ctx)2169 STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2170 auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy);
2171 store2(ptr, tail, r, g);
2172 }
2173
exclusive_repeat(F v,const SkRasterPipeline_TileCtx * ctx)2174 SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
2175 return v - floor_(v*ctx->invScale)*ctx->scale;
2176 }
exclusive_mirror(F v,const SkRasterPipeline_TileCtx * ctx)2177 SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
2178 auto limit = ctx->scale;
2179 auto invLimit = ctx->invScale;
2180 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2181 }
2182 // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
2183 // The gather stages will hard clamp the output of these stages to [0,limit)...
2184 // we just need to do the basic repeat or mirroring.
STAGE(repeat_x,const SkRasterPipeline_TileCtx * ctx)2185 STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); }
STAGE(repeat_y,const SkRasterPipeline_TileCtx * ctx)2186 STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); }
STAGE(mirror_x,const SkRasterPipeline_TileCtx * ctx)2187 STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); }
STAGE(mirror_y,const SkRasterPipeline_TileCtx * ctx)2188 STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); }
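// A quick worked example of the mirroring math above (illustrative only): with limit = 4
// (so invLimit = 0.25) and v = 5,
//     (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit
//   = 1 - 8*floor_(0.125) - 4 = -3,  and abs_(-3) = 3,
// i.e. 5 reflects about 4 down to 3, as mirror tiling expects.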
2189
2190 // Clamp x to [0,1], both sides inclusive (think, gradients).
2191 // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
clamp_01(F v)2192 SI F clamp_01(F v) { return min(max(0, v), 1); }
2193
STAGE(clamp_x_1,Ctx::None)2194 STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); }
STAGE(repeat_x_1,Ctx::None)2195 STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); }
STAGE(mirror_x_1,Ctx::None)2196 STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); }
2197
2198 // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
2199 // mask == 0x00000000 if the coordinate(s) are out of bounds
2200 // mask == 0xFFFFFFFF if the coordinate(s) are in bounds
2201 // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
2202 // if either of the coordinates were out of bounds.
2203
STAGE(decal_x,SkRasterPipeline_DecalTileCtx * ctx)2204 STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
2205 auto w = ctx->limit_x;
2206 sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
2207 }
STAGE(decal_y,SkRasterPipeline_DecalTileCtx * ctx)2208 STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
2209 auto h = ctx->limit_y;
2210 sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
2211 }
STAGE(decal_x_and_y,SkRasterPipeline_DecalTileCtx * ctx)2212 STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
2213 auto w = ctx->limit_x;
2214 auto h = ctx->limit_y;
2215 sk_unaligned_store(ctx->mask,
2216 cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h)));
2217 }
STAGE(check_decal_mask,SkRasterPipeline_DecalTileCtx * ctx)2218 STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
2219 auto mask = sk_unaligned_load<U32>(ctx->mask);
2220 r = bit_cast<F>( bit_cast<U32>(r) & mask );
2221 g = bit_cast<F>( bit_cast<U32>(g) & mask );
2222 b = bit_cast<F>( bit_cast<U32>(b) & mask );
2223 a = bit_cast<F>( bit_cast<U32>(a) & mask );
2224 }
2225
STAGE(alpha_to_gray,Ctx::None)2226 STAGE(alpha_to_gray, Ctx::None) {
2227 r = g = b = a;
2228 a = 1;
2229 }
STAGE(alpha_to_gray_dst,Ctx::None)2230 STAGE(alpha_to_gray_dst, Ctx::None) {
2231 dr = dg = db = da;
2232 da = 1;
2233 }
STAGE(bt709_luminance_or_luma_to_alpha,Ctx::None)2234 STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) {
2235 a = r*0.2126f + g*0.7152f + b*0.0722f;
2236 r = g = b = 0;
2237 }
2238
STAGE(matrix_translate,const float * m)2239 STAGE(matrix_translate, const float* m) {
2240 r += m[0];
2241 g += m[1];
2242 }
STAGE(matrix_scale_translate,const float * m)2243 STAGE(matrix_scale_translate, const float* m) {
2244 r = mad(r,m[0], m[2]);
2245 g = mad(g,m[1], m[3]);
2246 }
STAGE(matrix_2x3,const float * m)2247 STAGE(matrix_2x3, const float* m) {
2248 auto R = mad(r,m[0], mad(g,m[2], m[4])),
2249 G = mad(r,m[1], mad(g,m[3], m[5]));
2250 r = R;
2251 g = G;
2252 }
STAGE(matrix_3x3,const float * m)2253 STAGE(matrix_3x3, const float* m) {
2254 auto R = mad(r,m[0], mad(g,m[3], b*m[6])),
2255 G = mad(r,m[1], mad(g,m[4], b*m[7])),
2256 B = mad(r,m[2], mad(g,m[5], b*m[8]));
2257 r = R;
2258 g = G;
2259 b = B;
2260 }
STAGE(matrix_3x4,const float * m)2261 STAGE(matrix_3x4, const float* m) {
2262 auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
2263 G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
2264 B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
2265 r = R;
2266 g = G;
2267 b = B;
2268 }
STAGE(matrix_4x5,const float * m)2269 STAGE(matrix_4x5, const float* m) {
2270 auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))),
2271 G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))),
2272 B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))),
2273 A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19]))));
2274 r = R;
2275 g = G;
2276 b = B;
2277 a = A;
2278 }
STAGE(matrix_4x3,const float * m)2279 STAGE(matrix_4x3, const float* m) {
2280 auto X = r,
2281 Y = g;
2282
2283 r = mad(X, m[0], mad(Y, m[4], m[ 8]));
2284 g = mad(X, m[1], mad(Y, m[5], m[ 9]));
2285 b = mad(X, m[2], mad(Y, m[6], m[10]));
2286 a = mad(X, m[3], mad(Y, m[7], m[11]));
2287 }
STAGE(matrix_perspective,const float * m)2288 STAGE(matrix_perspective, const float* m) {
2289 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
2290 auto R = mad(r,m[0], mad(g,m[1], m[2])),
2291 G = mad(r,m[3], mad(g,m[4], m[5])),
2292 Z = mad(r,m[6], mad(g,m[7], m[8]));
2293 r = R * rcp(Z);
2294 g = G * rcp(Z);
2295 }
2296
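// The gradient contexts below store each stop interval as a per-channel slope/bias pair
// (fs[]/bs[]), so a looked-up color is just f*t + b. gradient_lookup() gathers those pairs for
// each lane's stop index (using an AVX2 permute when there are at most 8 stops) and applies one
// mad() per channel.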
gradient_lookup(const SkRasterPipeline_GradientCtx * c,U32 idx,F t,F * r,F * g,F * b,F * a)2297 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
2298 F* r, F* g, F* b, F* a) {
2299 F fr, br, fg, bg, fb, bb, fa, ba;
2300 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2301 if (c->stopCount <=8) {
2302 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
2303 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
2304 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx);
2305 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx);
2306 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx);
2307 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx);
2308 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx);
2309 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx);
2310 } else
2311 #endif
2312 {
2313 fr = gather(c->fs[0], idx);
2314 br = gather(c->bs[0], idx);
2315 fg = gather(c->fs[1], idx);
2316 bg = gather(c->bs[1], idx);
2317 fb = gather(c->fs[2], idx);
2318 bb = gather(c->bs[2], idx);
2319 fa = gather(c->fs[3], idx);
2320 ba = gather(c->bs[3], idx);
2321 }
2322
2323 *r = mad(t, fr, br);
2324 *g = mad(t, fg, bg);
2325 *b = mad(t, fb, bb);
2326 *a = mad(t, fa, ba);
2327 }
2328
STAGE(evenly_spaced_gradient,const SkRasterPipeline_GradientCtx * c)2329 STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
2330 auto t = r;
2331 auto idx = trunc_(t * (c->stopCount-1));
2332 gradient_lookup(c, idx, t, &r, &g, &b, &a);
2333 }
2334
STAGE(gradient,const SkRasterPipeline_GradientCtx * c)2335 STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
2336 auto t = r;
2337 U32 idx = 0;
2338
2339 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
2340 for (size_t i = 1; i < c->stopCount; i++) {
2341 idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
2342 }
2343
2344 gradient_lookup(c, idx, t, &r, &g, &b, &a);
2345 }
2346
STAGE(evenly_spaced_2_stop_gradient,const void * ctx)2347 STAGE(evenly_spaced_2_stop_gradient, const void* ctx) {
2348 // TODO: Rename Ctx SkRasterPipeline_EvenlySpaced2StopGradientCtx.
2349 struct Ctx { float f[4], b[4]; };
2350 auto c = (const Ctx*)ctx;
2351
2352 auto t = r;
2353 r = mad(t, c->f[0], c->b[0]);
2354 g = mad(t, c->f[1], c->b[1]);
2355 b = mad(t, c->f[2], c->b[2]);
2356 a = mad(t, c->f[3], c->b[3]);
2357 }
2358
STAGE(xy_to_unit_angle,Ctx::None)2359 STAGE(xy_to_unit_angle, Ctx::None) {
2360 F X = r,
2361 Y = g;
2362 F xabs = abs_(X),
2363 yabs = abs_(Y);
2364
2365 F slope = min(xabs, yabs)/max(xabs, yabs);
2366 F s = slope * slope;
2367
2368 // Use a 7th degree polynomial to approximate atan.
2369 // This was generated using sollya.gforge.inria.fr.
2370 // A float optimized polynomial was generated using the following command.
2371 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
2372 F phi = slope
2373 * (0.15912117063999176025390625f + s
2374 * (-5.185396969318389892578125e-2f + s
2375 * (2.476101927459239959716796875e-2f + s
2376 * (-7.0547382347285747528076171875e-3f))));
2377
2378 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
2379 phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi);
2380 phi = if_then_else(Y < 0.0f , 1.0f - phi , phi);
2381 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
2382 r = phi;
2383 }
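// A quick sanity check of the polynomial above (illustrative): at (X,Y) = (1,1) we get
// slope = 1, s = 1, and
//     phi ~= 0.15912117 - 0.05185397 + 0.02476102 - 0.00705474 ~= 0.12497,
// i.e. very nearly 1/8 of a turn (45 degrees), as expected.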
2384
STAGE(xy_to_radius,Ctx::None)2385 STAGE(xy_to_radius, Ctx::None) {
2386 F X2 = r * r,
2387 Y2 = g * g;
2388 r = sqrt_(X2 + Y2);
2389 }
2390
2391 // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
2392
STAGE(negate_x,Ctx::None)2393 STAGE(negate_x, Ctx::None) { r = -r; }
2394
STAGE(xy_to_2pt_conical_strip,const SkRasterPipeline_2PtConicalCtx * ctx)2395 STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
2396 F x = r, y = g, &t = r;
2397 t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
2398 }
2399
STAGE(xy_to_2pt_conical_focal_on_circle,Ctx::None)2400 STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) {
2401 F x = r, y = g, &t = r;
2402 t = x + y*y / x; // (x^2 + y^2) / x
2403 }
2404
STAGE(xy_to_2pt_conical_well_behaved,const SkRasterPipeline_2PtConicalCtx * ctx)2405 STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
2406 F x = r, y = g, &t = r;
2407 t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2408 }
2409
STAGE(xy_to_2pt_conical_greater,const SkRasterPipeline_2PtConicalCtx * ctx)2410 STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
2411 F x = r, y = g, &t = r;
2412 t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2413 }
2414
STAGE(xy_to_2pt_conical_smaller,const SkRasterPipeline_2PtConicalCtx * ctx)2415 STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
2416 F x = r, y = g, &t = r;
2417 t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2418 }
2419
STAGE(alter_2pt_conical_compensate_focal,const SkRasterPipeline_2PtConicalCtx * ctx)2420 STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
2421 F& t = r;
2422 t = t + ctx->fP1; // ctx->fP1 = f
2423 }
2424
STAGE(alter_2pt_conical_unswap,Ctx::None)2425 STAGE(alter_2pt_conical_unswap, Ctx::None) {
2426 F& t = r;
2427 t = 1 - t;
2428 }
2429
STAGE(mask_2pt_conical_nan,SkRasterPipeline_2PtConicalCtx * c)2430 STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
2431 F& t = r;
2432 auto is_degenerate = (t != t); // NaN
2433 t = if_then_else(is_degenerate, F(0), t);
2434 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2435 }
2436
STAGE(mask_2pt_conical_degenerates,SkRasterPipeline_2PtConicalCtx * c)2437 STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
2438 F& t = r;
2439 auto is_degenerate = (t <= 0) | (t != t);
2440 t = if_then_else(is_degenerate, F(0), t);
2441 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2442 }
2443
STAGE(apply_vector_mask,const uint32_t * ctx)2444 STAGE(apply_vector_mask, const uint32_t* ctx) {
2445 const U32 mask = sk_unaligned_load<U32>(ctx);
2446 r = bit_cast<F>(bit_cast<U32>(r) & mask);
2447 g = bit_cast<F>(bit_cast<U32>(g) & mask);
2448 b = bit_cast<F>(bit_cast<U32>(b) & mask);
2449 a = bit_cast<F>(bit_cast<U32>(a) & mask);
2450 }
2451
STAGE(save_xy,SkRasterPipeline_SamplerCtx * c)2452 STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) {
2453 // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
2454 // They're either the 4 corners of a logical 1x1 pixel (bilinear, +/-0.5 offsets) or the
2455 // 16 points of a 4x4 grid (bicubic, +/-0.5 and +/-1.5 offsets) surrounding (x,y).
2456 F fx = fract(r + 0.5f),
2457 fy = fract(g + 0.5f);
2458
2459 // Samplers will need to load x and fx, or y and fy.
2460 sk_unaligned_store(c->x, r);
2461 sk_unaligned_store(c->y, g);
2462 sk_unaligned_store(c->fx, fx);
2463 sk_unaligned_store(c->fy, fy);
2464 }
2465
STAGE(accumulate,const SkRasterPipeline_SamplerCtx * c)2466 STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
2467 // Bilinear and bicubic filters are both separable, so we produce independent contributions
2468 // from x and y, multiplying them together here to get each pixel's total scale factor.
2469 auto scale = sk_unaligned_load<F>(c->scalex)
2470 * sk_unaligned_load<F>(c->scaley);
2471 dr = mad(scale, r, dr);
2472 dg = mad(scale, g, dg);
2473 db = mad(scale, b, db);
2474 da = mad(scale, a, da);
2475 }
2476
2477 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2478 // are combined in direct proportion to their area overlapping that logical query pixel.
2479 // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
2480 // The y-axis is symmetric.
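//
// As a concrete example of the weights described above (illustrative only): if fx = 0.25, the
// sample at -0.5 in x gets weight 1-fx = 0.75 and the sample at +0.5 gets fx = 0.25; combined
// with the matching fy weights, the four area weights always sum to exactly 1.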
2481
2482 template <int kScale>
bilinear_x(SkRasterPipeline_SamplerCtx * ctx,F * x)2483 SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2484 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2485 F fx = sk_unaligned_load<F>(ctx->fx);
2486
2487 F scalex;
2488 if (kScale == -1) { scalex = 1.0f - fx; }
2489 if (kScale == +1) { scalex = fx; }
2490 sk_unaligned_store(ctx->scalex, scalex);
2491 }
2492 template <int kScale>
bilinear_y(SkRasterPipeline_SamplerCtx * ctx,F * y)2493 SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2494 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2495 F fy = sk_unaligned_load<F>(ctx->fy);
2496
2497 F scaley;
2498 if (kScale == -1) { scaley = 1.0f - fy; }
2499 if (kScale == +1) { scaley = fy; }
2500 sk_unaligned_store(ctx->scaley, scaley);
2501 }
2502
STAGE(bilinear_nx,SkRasterPipeline_SamplerCtx * ctx)2503 STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px,SkRasterPipeline_SamplerCtx * ctx)2504 STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny,SkRasterPipeline_SamplerCtx * ctx)2505 STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py,SkRasterPipeline_SamplerCtx * ctx)2506 STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); }
2507
2508
2509 // In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2510 // pixel center are combined with a non-uniform cubic filter, with higher values near the center.
2511 //
2512 // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
2513 // See GrCubicEffect for details of this particular filter.
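//
// One useful property of this filter (easy to verify by expanding the cubics below): for any
// t in [0,1],
//     bicubic_near(t) + bicubic_far(t) + bicubic_near(1-t) + bicubic_far(1-t) == 1,
// so each row/column of weights sums to 1 and the 4x4 kernel preserves flat colors.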
2514
bicubic_near(F t)2515 SI F bicubic_near(F t) {
2516 // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
2517 return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f));
2518 }
bicubic_far(F t)2519 SI F bicubic_far(F t) {
2520 // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
2521 return (t*t)*mad((7/18.0f), t, (-6/18.0f));
2522 }
2523
2524 template <int kScale>
bicubic_x(SkRasterPipeline_SamplerCtx * ctx,F * x)2525 SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2526 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2527 F fx = sk_unaligned_load<F>(ctx->fx);
2528
2529 F scalex;
2530 if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
2531 if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
2532 if (kScale == +1) { scalex = bicubic_near( fx); }
2533 if (kScale == +3) { scalex = bicubic_far ( fx); }
2534 sk_unaligned_store(ctx->scalex, scalex);
2535 }
2536 template <int kScale>
bicubic_y(SkRasterPipeline_SamplerCtx * ctx,F * y)2537 SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2538 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2539 F fy = sk_unaligned_load<F>(ctx->fy);
2540
2541 F scaley;
2542 if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
2543 if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
2544 if (kScale == +1) { scaley = bicubic_near( fy); }
2545 if (kScale == +3) { scaley = bicubic_far ( fy); }
2546 sk_unaligned_store(ctx->scaley, scaley);
2547 }
2548
STAGE(bicubic_n3x,SkRasterPipeline_SamplerCtx * ctx)2549 STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x,SkRasterPipeline_SamplerCtx * ctx)2550 STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x,SkRasterPipeline_SamplerCtx * ctx)2551 STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x,SkRasterPipeline_SamplerCtx * ctx)2552 STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); }
2553
STAGE(bicubic_n3y,SkRasterPipeline_SamplerCtx * ctx)2554 STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y,SkRasterPipeline_SamplerCtx * ctx)2555 STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y,SkRasterPipeline_SamplerCtx * ctx)2556 STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y,SkRasterPipeline_SamplerCtx * ctx)2557 STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); }
2558
STAGE(callback,SkRasterPipeline_CallbackCtx * c)2559 STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
2560 store4(c->rgba,0, r,g,b,a);
2561 c->fn(c, tail ? tail : N);
2562 load4(c->read_from,0, &r,&g,&b,&a);
2563 }
2564
2565 // shader: void main(float x, float y, inout half4 color)
2566 // colorfilter: void main(inout half4 color)
STAGE(interpreter,SkRasterPipeline_InterpreterCtx * c)2567 STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
2568 // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in
2569 // the interpreter. This is a known issue, and will be addressed at some point.
2570 float xx[N], yy[N],
2571 rr[N], gg[N], bb[N], aa[N];
2572
2573 float* args[] = { xx, yy, rr, gg, bb, aa };
2574 float** in_args = args;
2575 int in_count = 6;
2576
2577 if (c->shaderConvention) {
2578 // our caller must have called seed_shader to set these
2579 sk_unaligned_store(xx, r);
2580 sk_unaligned_store(yy, g);
2581 sk_unaligned_store(rr, F(c->paintColor.fR));
2582 sk_unaligned_store(gg, F(c->paintColor.fG));
2583 sk_unaligned_store(bb, F(c->paintColor.fB));
2584 sk_unaligned_store(aa, F(c->paintColor.fA));
2585 } else {
2586 in_args += 2; // skip x,y
2587 in_count = 4;
2588 sk_unaligned_store(rr, r);
2589 sk_unaligned_store(gg, g);
2590 sk_unaligned_store(bb, b);
2591 sk_unaligned_store(aa, a);
2592 }
2593
2594 SkAssertResult(c->byteCode->runStriped(c->fn, in_args, in_count, tail ? tail : N,
2595 (const float*)c->inputs, c->ninputs, nullptr, 0));
2596
2597 r = sk_unaligned_load<F>(rr);
2598 g = sk_unaligned_load<F>(gg);
2599 b = sk_unaligned_load<F>(bb);
2600 a = sk_unaligned_load<F>(aa);
2601 }
2602
STAGE(gauss_a_to_rgba,Ctx::None)2603 STAGE(gauss_a_to_rgba, Ctx::None) {
2604 // x = 1 - x;
2605 // exp(-x * x * 4) - 0.018f;
2606 // ... now approximate with quartic
2607 //
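    // One handy property of the coefficients below: they sum to 1 (up to float rounding), so
    // full coverage (a == 1) maps back to (essentially) full coverage.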
2608 const float c4 = -2.26661229133605957031f;
2609 const float c3 = 2.89795351028442382812f;
2610 const float c2 = 0.21345567703247070312f;
2611 const float c1 = 0.15489584207534790039f;
2612 const float c0 = 0.00030726194381713867f;
2613 a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0);
2614 r = a;
2615 g = a;
2616 b = a;
2617 }
2618
tile(F v,SkTileMode mode,float limit,float invLimit)2619 SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
2620 // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here.
2621 switch (mode) {
2622 case SkTileMode::kDecal: // TODO, for now fallthrough to clamp
2623 case SkTileMode::kClamp: return v;
2624 case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
2625 case SkTileMode::kMirror:
2626 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2627 }
2628 SkUNREACHABLE;
2629 }
2630
sample(const SkRasterPipeline_SamplerCtx2 * ctx,F x,F y,F * r,F * g,F * b,F * a)2631 SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
2632 F* r, F* g, F* b, F* a) {
2633 x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
2634 y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
2635
2636 switch (ctx->ct) {
2637 default: *r = *g = *b = *a = 0; // TODO
2638 break;
2639
2640 case kRGBA_8888_SkColorType:
2641 case kBGRA_8888_SkColorType: {
2642 const uint32_t* ptr;
2643 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2644 from_8888(gather(ptr, ix), r,g,b,a);
2645 if (ctx->ct == kBGRA_8888_SkColorType) {
2646 std::swap(*r,*b);
2647 }
2648 } break;
2649 }
2650 }
2651
2652 template <int D>
sampler(const SkRasterPipeline_SamplerCtx2 * ctx,F cx,F cy,const F (& wx)[D],const F (& wy)[D],F * r,F * g,F * b,F * a)2653 SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
2654 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
2655 F* r, F* g, F* b, F* a) {
2656
2657 float start = -0.5f*(D-1);
2658
2659 *r = *g = *b = *a = 0;
2660 F y = cy + start;
2661 for (int j = 0; j < D; j++, y += 1.0f) {
2662 F x = cx + start;
2663 for (int i = 0; i < D; i++, x += 1.0f) {
2664 F R,G,B,A;
2665 sample(ctx, x,y, &R,&G,&B,&A);
2666
2667 F w = wx[i] * wy[j];
2668 *r = mad(w,R,*r);
2669 *g = mad(w,G,*g);
2670 *b = mad(w,B,*b);
2671 *a = mad(w,A,*a);
2672 }
2673 }
2674 }
2675
STAGE(bilinear,const SkRasterPipeline_SamplerCtx2 * ctx)2676 STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
2677 F x = r, fx = fract(x + 0.5f),
2678 y = g, fy = fract(y + 0.5f);
2679 const F wx[] = {1.0f - fx, fx};
2680 const F wy[] = {1.0f - fy, fy};
2681
2682 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2683 }
STAGE(bicubic,SkRasterPipeline_SamplerCtx2 * ctx)2684 STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) {
2685 F x = r, fx = fract(x + 0.5f),
2686 y = g, fy = fract(y + 0.5f);
2687 const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) };
2688 const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) };
2689
2690 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2691 }
2692
2693 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
STAGE(bilerp_clamp_8888,const SkRasterPipeline_GatherCtx * ctx)2694 STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2695 // (cx,cy) are the center of our sample.
2696 F cx = r,
2697 cy = g;
2698
2699 // All sample points are at the same fractional offset (fx,fy).
2700 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
2701 F fx = fract(cx + 0.5f),
2702 fy = fract(cy + 0.5f);
2703
2704 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
2705 r = g = b = a = 0;
2706
2707 for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
2708 for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
2709 // (x,y) are the coordinates of this sample point.
2710 F x = cx + dx,
2711 y = cy + dy;
2712
2713 // ix_and_ptr() will clamp to the image's bounds for us.
2714 const uint32_t* ptr;
2715 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2716
2717 F sr,sg,sb,sa;
2718 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2719
2720 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2721 // are combined in direct proportion to their area overlapping that logical query pixel.
2722 // At positive offsets, the x-axis contribution to that rectangle is fx,
2723 // or (1-fx) at negative x. Same deal for y.
2724 F sx = (dx > 0) ? fx : 1.0f - fx,
2725 sy = (dy > 0) ? fy : 1.0f - fy,
2726 area = sx * sy;
2727
2728 r += sr * area;
2729 g += sg * area;
2730 b += sb * area;
2731 a += sa * area;
2732 }
2733 }
2734
2735 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
STAGE(bicubic_clamp_8888,const SkRasterPipeline_GatherCtx * ctx)2736 STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2737 // (cx,cy) are the center of our sample.
2738 F cx = r,
2739 cy = g;
2740
2741 // All sample points are at the same fractional offset (fx,fy).
2742 // They're the 16 points of a 4x4 grid surrounding (x,y), at +/-0.5 and +/-1.5 offsets.
2743 F fx = fract(cx + 0.5f),
2744 fy = fract(cy + 0.5f);
2745
2746 // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
2747 r = g = b = a = 0;
2748
2749 const F scaley[4] = {
2750 bicubic_far (1.0f - fy), bicubic_near(1.0f - fy),
2751 bicubic_near( fy), bicubic_far ( fy),
2752 };
2753 const F scalex[4] = {
2754 bicubic_far (1.0f - fx), bicubic_near(1.0f - fx),
2755 bicubic_near( fx), bicubic_far ( fx),
2756 };
2757
2758 F sample_y = cy - 1.5f;
2759 for (int yy = 0; yy <= 3; ++yy) {
2760 F sample_x = cx - 1.5f;
2761 for (int xx = 0; xx <= 3; ++xx) {
2762 F scale = scalex[xx] * scaley[yy];
2763
2764 // ix_and_ptr() will clamp to the image's bounds for us.
2765 const uint32_t* ptr;
2766 U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
2767
2768 F sr,sg,sb,sa;
2769 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2770
2771 r = mad(scale, sr, r);
2772 g = mad(scale, sg, g);
2773 b = mad(scale, sb, b);
2774 a = mad(scale, sa, a);
2775
2776 sample_x += 1;
2777 }
2778 sample_y += 1;
2779 }
2780 }
2781
2782 // ~~~~~~ GrSwizzle stage ~~~~~~ //
2783
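// Note (a reading of the stage below, not documented elsewhere here): the four swizzle
// characters ('r','g','b','a','0','1') appear to be packed into the bytes of the context
// pointer value itself, which is why the stage memcpy's from &ctx rather than dereferencing it.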
STAGE(swizzle,void * ctx)2784 STAGE(swizzle, void* ctx) {
2785 auto ir = r, ig = g, ib = b, ia = a;
2786 F* o[] = {&r, &g, &b, &a};
2787 char swiz[4];
2788 memcpy(swiz, &ctx, sizeof(swiz));
2789
2790 for (int i = 0; i < 4; ++i) {
2791 switch (swiz[i]) {
2792 case 'r': *o[i] = ir; break;
2793 case 'g': *o[i] = ig; break;
2794 case 'b': *o[i] = ib; break;
2795 case 'a': *o[i] = ia; break;
2796 case '0': *o[i] = F(0); break;
2797 case '1': *o[i] = F(1); break;
2798 default: break;
2799 }
2800 }
2801 }
2802
2803 namespace lowp {
2804 #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
2805 // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
2806 // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the
2807 // highp float pipeline.
2808 #define M(st) static void (*st)(void) = nullptr;
2809 SK_RASTER_PIPELINE_STAGES(M)
2810 #undef M
2811 static void (*just_return)(void) = nullptr;
2812
start_pipeline(size_t,size_t,size_t,size_t,void **)2813 static void start_pipeline(size_t,size_t,size_t,size_t, void**) {}
2814
2815 #else // We are compiling vector code with Clang... let's make some lowp stages!
2816
2817 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2818 using U8 = uint8_t __attribute__((ext_vector_type(16)));
2819 using U16 = uint16_t __attribute__((ext_vector_type(16)));
2820 using I16 = int16_t __attribute__((ext_vector_type(16)));
2821 using I32 = int32_t __attribute__((ext_vector_type(16)));
2822 using U32 = uint32_t __attribute__((ext_vector_type(16)));
2823 using F = float __attribute__((ext_vector_type(16)));
2824 #else
2825 using U8 = uint8_t __attribute__((ext_vector_type(8)));
2826 using U16 = uint16_t __attribute__((ext_vector_type(8)));
2827 using I16 = int16_t __attribute__((ext_vector_type(8)));
2828 using I32 = int32_t __attribute__((ext_vector_type(8)));
2829 using U32 = uint32_t __attribute__((ext_vector_type(8)));
2830 using F = float __attribute__((ext_vector_type(8)));
2831 #endif
2832
2833 static const size_t N = sizeof(U16) / sizeof(uint16_t);
2834
2835 // Once again, some platforms benefit from a restricted Stage calling convention,
2836 // but others can pass tons and tons of registers and we're happy to exploit that.
2837 // It's exactly the same decision and implementation strategy as the F stages above.
2838 #if JUMPER_NARROW_STAGES
2839 struct Params {
2840 size_t dx, dy, tail;
2841 U16 dr,dg,db,da;
2842 };
2843 using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a);
2844 #else
2845 // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
2846 using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
2847 U16 r, U16 g, U16 b, U16 a,
2848 U16 dr, U16 dg, U16 db, U16 da);
2849 #endif
2850
2851 static void start_pipeline(const size_t x0, const size_t y0,
2852 const size_t xlimit, const size_t ylimit, void** program) {
2853 auto start = (Stage)load_and_inc(program);
2854 for (size_t dy = y0; dy < ylimit; dy++) {
2855 #if JUMPER_NARROW_STAGES
2856 Params params = { x0,dy,0, 0,0,0,0 };
2857 for (; params.dx + N <= xlimit; params.dx += N) {
2858             start(&params,program, 0,0,0,0);
2859 }
2860 if (size_t tail = xlimit - params.dx) {
2861 params.tail = tail;
2862             start(&params,program, 0,0,0,0);
2863 }
2864 #else
2865 size_t dx = x0;
2866 for (; dx + N <= xlimit; dx += N) {
2867 start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0);
2868 }
2869 if (size_t tail = xlimit - dx) {
2870 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
2871 }
2872 #endif
2873 }
2874 }
2875
2876 #if JUMPER_NARROW_STAGES
2877 static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
2878 #else
2879 static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
2880 #endif
2881
2882 // All stages use the same function call ABI to chain into each other, but there are three types:
2883 // GG: geometry in, geometry out -- think, a matrix
2884 // GP: geometry in, pixels out. -- think, a memory gather
2885 // PP: pixels in, pixels out. -- think, a blend mode
2886 //
2887 // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
2888 //
2889 // These three STAGE_ macros let you define each type of stage,
2890 // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
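//
// As a purely hypothetical sketch (scale_by_byte is not a real stage in this file), a PP stage
// that scaled every channel by a constant byte value could be written as:
//
//     STAGE_PP(scale_by_byte, const uint8_t* v) {
//         U16 c = *v;            // splat the byte across all lanes
//         r = div255(r * c);     // lowp pixels are 8-bit values held in 16-bit lanes
//         g = div255(g * c);
//         b = div255(b * c);
//         a = div255(a * c);
//     }
//
// dr,dg,db,da are available the same way for stages that need the dst pixels.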
2891
2892 #if JUMPER_NARROW_STAGES
2893 #define STAGE_GG(name, ...) \
2894 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
2895 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
2896 auto x = join<F>(r,g), \
2897 y = join<F>(b,a); \
2898 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
2899 split(x, &r,&g); \
2900 split(y, &b,&a); \
2901 auto next = (Stage)load_and_inc(program); \
2902 next(params,program, r,g,b,a); \
2903 } \
2904 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
2905
2906 #define STAGE_GP(name, ...) \
2907 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
2908 U16& r, U16& g, U16& b, U16& a, \
2909 U16& dr, U16& dg, U16& db, U16& da); \
2910 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
2911 auto x = join<F>(r,g), \
2912 y = join<F>(b,a); \
2913 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
2914 params->dr,params->dg,params->db,params->da); \
2915 auto next = (Stage)load_and_inc(program); \
2916 next(params,program, r,g,b,a); \
2917 } \
2918 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
2919 U16& r, U16& g, U16& b, U16& a, \
2920 U16& dr, U16& dg, U16& db, U16& da)
2921
2922 #define STAGE_PP(name, ...) \
2923 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
2924 U16& r, U16& g, U16& b, U16& a, \
2925 U16& dr, U16& dg, U16& db, U16& da); \
2926 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
2927 name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
2928 params->dr,params->dg,params->db,params->da); \
2929 auto next = (Stage)load_and_inc(program); \
2930 next(params,program, r,g,b,a); \
2931 } \
2932 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
2933 U16& r, U16& g, U16& b, U16& a, \
2934 U16& dr, U16& dg, U16& db, U16& da)
2935 #else
2936 #define STAGE_GG(name, ...) \
2937 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
2938 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
2939 U16 r, U16 g, U16 b, U16 a, \
2940 U16 dr, U16 dg, U16 db, U16 da) { \
2941 auto x = join<F>(r,g), \
2942 y = join<F>(b,a); \
2943 name##_k(Ctx{program}, dx,dy,tail, x,y); \
2944 split(x, &r,&g); \
2945 split(y, &b,&a); \
2946 auto next = (Stage)load_and_inc(program); \
2947 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
2948 } \
2949 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
2950
2951 #define STAGE_GP(name, ...) \
2952 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
2953 U16& r, U16& g, U16& b, U16& a, \
2954 U16& dr, U16& dg, U16& db, U16& da); \
2955 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
2956 U16 r, U16 g, U16 b, U16 a, \
2957 U16 dr, U16 dg, U16 db, U16 da) { \
2958 auto x = join<F>(r,g), \
2959 y = join<F>(b,a); \
2960 name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
2961 auto next = (Stage)load_and_inc(program); \
2962 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
2963 } \
2964 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
2965 U16& r, U16& g, U16& b, U16& a, \
2966 U16& dr, U16& dg, U16& db, U16& da)
2967
2968 #define STAGE_PP(name, ...) \
2969 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
2970 U16& r, U16& g, U16& b, U16& a, \
2971 U16& dr, U16& dg, U16& db, U16& da); \
2972 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
2973 U16 r, U16 g, U16 b, U16 a, \
2974 U16 dr, U16 dg, U16 db, U16 da) { \
2975 name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
2976 auto next = (Stage)load_and_inc(program); \
2977 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
2978 } \
2979 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
2980 U16& r, U16& g, U16& b, U16& a, \
2981 U16& dr, U16& dg, U16& db, U16& da)
2982 #endif
2983
2984 // ~~~~~~ Commonly used helper functions ~~~~~~ //
2985
2986 SI U16 div255(U16 v) {
2987 #if 0
2988 return (v+127)/255; // The ideal rounding divide by 255.
2989 #elif 1 && defined(JUMPER_IS_NEON)
2990 // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
2991 // just as fast as we can do the approximation below, so might as well be correct!
2992 // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
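// Worked example: v = 200*255 = 51000:  51000 + ((51000+128)>>8) = 51000 + 199 = 51199,
// then (51199+128)>>8 = 200, matching the exact (51000+127)/255 = 200.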
2993 return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
2994 #else
2995 return (v+255)/256; // A good approximation of (v+127)/255.
2996 #endif
2997 }
2998
2999 SI U16 inv(U16 v) { return 255-v; }
3000
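// In these, c is a lane mask from a comparison: all-ones lanes select t, all-zeros lanes select e.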
3001 SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
3002 SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
3003
3004 SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
3005 SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
3006 SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
3007 SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }
3008
3009 SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
3010
3011 SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
3012
3013 template <typename D, typename S>
3014 SI D cast(S src) {
3015 return __builtin_convertvector(src, D);
3016 }
3017
3018 template <typename D, typename S>
3019 SI void split(S v, D* lo, D* hi) {
3020 static_assert(2*sizeof(D) == sizeof(S), "");
3021 memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
3022 memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
3023 }
3024 template <typename D, typename S>
3025 SI D join(S lo, S hi) {
3026 static_assert(sizeof(D) == 2*sizeof(S), "");
3027 D v;
3028 memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
3029 memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
3030 return v;
3031 }
3032
3033 SI F if_then_else(I32 c, F t, F e) {
3034 return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
3035 }
3036 SI F max(F x, F y) { return if_then_else(x < y, y, x); }
3037 SI F min(F x, F y) { return if_then_else(x < y, x, y); }
3038
3039 SI F mad(F f, F m, F a) { return f*m+a; }
3040 SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
3041
3042 SI F rcp(F x) {
3043 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3044 __m256 lo,hi;
3045 split(x, &lo,&hi);
3046 return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi));
3047 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3048 __m128 lo,hi;
3049 split(x, &lo,&hi);
3050 return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi));
3051 #elif defined(JUMPER_IS_NEON)
3052 auto rcp = [](float32x4_t v) {
3053 auto est = vrecpeq_f32(v);
3054 return vrecpsq_f32(v,est)*est;
3055 };
3056 float32x4_t lo,hi;
3057 split(x, &lo,&hi);
3058 return join<F>(rcp(lo), rcp(hi));
3059 #else
3060 return 1.0f / x;
3061 #endif
3062 }
3063 SI F sqrt_(F x) {
3064 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3065 __m256 lo,hi;
3066 split(x, &lo,&hi);
3067 return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
3068 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3069 __m128 lo,hi;
3070 split(x, &lo,&hi);
3071 return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
3072 #elif defined(SK_CPU_ARM64)
3073 float32x4_t lo,hi;
3074 split(x, &lo,&hi);
3075 return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
3076 #elif defined(JUMPER_IS_NEON)
3077 auto sqrt = [](float32x4_t v) {
3078 auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
3079 est *= vrsqrtsq_f32(v,est*est);
3080 est *= vrsqrtsq_f32(v,est*est);
3081 return v*est; // sqrt(v) == v*rsqrt(v).
3082 };
3083 float32x4_t lo,hi;
3084 split(x, &lo,&hi);
3085 return join<F>(sqrt(lo), sqrt(hi));
3086 #else
3087 return F{
3088 sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
3089 sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
3090 };
3091 #endif
3092 }
3093
3094 SI F floor_(F x) {
3095 #if defined(SK_CPU_ARM64)
3096 float32x4_t lo,hi;
3097 split(x, &lo,&hi);
3098 return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
3099 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3100 __m256 lo,hi;
3101 split(x, &lo,&hi);
3102 return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
3103 #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3104 __m128 lo,hi;
3105 split(x, &lo,&hi);
3106 return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
3107 #else
3108 F roundtrip = cast<F>(cast<I32>(x));
3109 return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
3110 #endif
3111 }
3112 SI F fract(F x) { return x - floor_(x); }
3113 SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
3114
3115 // ~~~~~~ Basic / misc. stages ~~~~~~ //
3116
3117 STAGE_GG(seed_shader, Ctx::None) {
3118 static const float iota[] = {
3119 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3120 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3121 };
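// Offsetting iota by dx puts x at the centers of this run's N pixels; y is this row's center.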
3122 x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
3123 y = cast<F>(I32(dy)) + 0.5f;
3124 }
3125
3126 STAGE_GG(matrix_translate, const float* m) {
3127 x += m[0];
3128 y += m[1];
3129 }
3130 STAGE_GG(matrix_scale_translate, const float* m) {
3131 x = mad(x,m[0], m[2]);
3132 y = mad(y,m[1], m[3]);
3133 }
3134 STAGE_GG(matrix_2x3, const float* m) {
3135 auto X = mad(x,m[0], mad(y,m[2], m[4])),
3136 Y = mad(x,m[1], mad(y,m[3], m[5]));
3137 x = X;
3138 y = Y;
3139 }
3140 STAGE_GG(matrix_perspective, const float* m) {
3141 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
3142 auto X = mad(x,m[0], mad(y,m[1], m[2])),
3143 Y = mad(x,m[3], mad(y,m[4], m[5])),
3144 Z = mad(x,m[6], mad(y,m[7], m[8]));
3145 x = X * rcp(Z);
3146 y = Y * rcp(Z);
3147 }
3148
3149 STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
3150 r = c->rgba[0];
3151 g = c->rgba[1];
3152 b = c->rgba[2];
3153 a = c->rgba[3];
3154 }
3155 STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
3156 dr = c->rgba[0];
3157 dg = c->rgba[1];
3158 db = c->rgba[2];
3159 da = c->rgba[3];
3160 }
3161 STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
3162 STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
3163
3164 STAGE_PP(set_rgb, const float rgb[3]) {
3165 r = from_float(rgb[0]);
3166 g = from_float(rgb[1]);
3167 b = from_float(rgb[2]);
3168 }
3169
3170 STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ }
3171 STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ }
3172
3173 STAGE_PP(clamp_a, Ctx::None) {
3174 r = min(r, a);
3175 g = min(g, a);
3176 b = min(b, a);
3177 }
3178
3179 STAGE_PP(clamp_gamut, Ctx::None) {
3180 // It shouldn't be possible to get out-of-gamut
3181 // colors when working in lowp.
3182 }
3183
3184 STAGE_PP(premul, Ctx::None) {
3185 r = div255(r * a);
3186 g = div255(g * a);
3187 b = div255(b * a);
3188 }
3189 STAGE_PP(premul_dst, Ctx::None) {
3190 dr = div255(dr * da);
3191 dg = div255(dg * da);
3192 db = div255(db * da);
3193 }
3194
3195 STAGE_PP(force_opaque , Ctx::None) { a = 255; }
3196 STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
3197
3198 STAGE_PP(swap_rb, Ctx::None) {
3199 auto tmp = r;
3200 r = b;
3201 b = tmp;
3202 }
3203 STAGE_PP(swap_rb_dst, Ctx::None) {
3204 auto tmp = dr;
3205 dr = db;
3206 db = tmp;
3207 }
3208
3209 STAGE_PP(move_src_dst, Ctx::None) {
3210 dr = r;
3211 dg = g;
3212 db = b;
3213 da = a;
3214 }
3215
3216 STAGE_PP(move_dst_src, Ctx::None) {
3217 r = dr;
3218 g = dg;
3219 b = db;
3220 a = da;
3221 }
3222
3223 // ~~~~~~ Blend modes ~~~~~~ //
3224
3225 // The same logic applied to all 4 channels.
3226 #define BLEND_MODE(name) \
3227 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3228 STAGE_PP(name, Ctx::None) { \
3229 r = name##_channel(r,dr,a,da); \
3230 g = name##_channel(g,dg,a,da); \
3231 b = name##_channel(b,db,a,da); \
3232 a = name##_channel(a,da,a,da); \
3233 } \
3234 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3235
3236 BLEND_MODE(clear) { return 0; }
3237 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
3238 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
3239 BLEND_MODE(srcin) { return div255( s*da ); }
3240 BLEND_MODE(dstin) { return div255( d*sa ); }
3241 BLEND_MODE(srcout) { return div255( s*inv(da) ); }
3242 BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
3243 BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
3244 BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
3245 BLEND_MODE(modulate) { return div255( s*d ); }
3246 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
3247 BLEND_MODE(plus_) { return min(s+d, 255); }
3248 BLEND_MODE(screen) { return s + d - div255( s*d ); }
3249 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
3250 #undef BLEND_MODE
3251
3252 // The same logic applied to color, and srcover for alpha.
3253 #define BLEND_MODE(name) \
3254 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3255 STAGE_PP(name, Ctx::None) { \
3256 r = name##_channel(r,dr,a,da); \
3257 g = name##_channel(g,dg,a,da); \
3258 b = name##_channel(b,db,a,da); \
3259 a = a + div255( da*inv(a) ); \
3260 } \
3261 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3262
3263 BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
3264 BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
3265 BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
3266 BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
3267
3268 BLEND_MODE(hardlight) {
3269 return div255( s*inv(da) + d*inv(sa) +
3270 if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3271 }
3272 BLEND_MODE(overlay) {
3273 return div255( s*inv(da) + d*inv(sa) +
3274 if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3275 }
3276 #undef BLEND_MODE
3277
3278 // ~~~~~~ Helpers for interacting with memory ~~~~~~ //
3279
3280 template <typename T>
3281 SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
3282 return (T*)ctx->pixels + dy*ctx->stride + dx;
3283 }
3284
3285 template <typename T>
3286 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
3287 auto clamp = [](F v, F limit) {
3288 limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
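// Subtracting 1 from the bit pattern of a positive float steps down to the next
// representable value, e.g. 8.0f (0x41000000) becomes 7.9999995f, so clamping to it
// keeps trunc_() strictly below the original limit.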
3289 return min(max(0, v), limit);
3290 };
3291 x = clamp(x, ctx->width);
3292 y = clamp(y, ctx->height);
3293
3294 *ptr = (const T*)ctx->pixels;
3295 return trunc_(y)*ctx->stride + trunc_(x);
3296 }
3297
3298 template <typename V, typename T>
3299 SI V load(const T* ptr, size_t tail) {
3300 V v = 0;
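// A tail of 0 means a full register of N pixels; otherwise only the low `tail` lanes are valid.
// The cases below fall through on purpose, peeling off the highest lanes one at a time
// until a single memcpy can copy the rest.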
3301 switch (tail & (N-1)) {
3302 case 0: memcpy(&v, ptr, sizeof(v)); break;
3303 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3304 case 15: v[14] = ptr[14];
3305 case 14: v[13] = ptr[13];
3306 case 13: v[12] = ptr[12];
3307 case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
3308 case 11: v[10] = ptr[10];
3309 case 10: v[ 9] = ptr[ 9];
3310 case 9: v[ 8] = ptr[ 8];
3311 case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
3312 #endif
3313 case 7: v[ 6] = ptr[ 6];
3314 case 6: v[ 5] = ptr[ 5];
3315 case 5: v[ 4] = ptr[ 4];
3316 case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
3317 case 3: v[ 2] = ptr[ 2];
3318 case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
3319 case 1: v[ 0] = ptr[ 0];
3320 }
3321 return v;
3322 }
3323 template <typename V, typename T>
3324 SI void store(T* ptr, size_t tail, V v) {
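// The mirror image of load() above: cases fall through on purpose, storing only the
// low `tail` lanes when tail != 0.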
3325 switch (tail & (N-1)) {
3326 case 0: memcpy(ptr, &v, sizeof(v)); break;
3327 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3328 case 15: ptr[14] = v[14];
3329 case 14: ptr[13] = v[13];
3330 case 13: ptr[12] = v[12];
3331 case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
3332 case 11: ptr[10] = v[10];
3333 case 10: ptr[ 9] = v[ 9];
3334 case 9: ptr[ 8] = v[ 8];
3335 case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
3336 #endif
3337 case 7: ptr[ 6] = v[ 6];
3338 case 6: ptr[ 5] = v[ 5];
3339 case 5: ptr[ 4] = v[ 4];
3340 case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
3341 case 3: ptr[ 2] = v[ 2];
3342 case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
3343 case 1: ptr[ 0] = v[ 0];
3344 }
3345 }
3346
3347 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3348 template <typename V, typename T>
3349 SI V gather(const T* ptr, U32 ix) {
3350 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3351 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
3352 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
3353 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
3354 }
3355
3356 template<>
3357 F gather(const float* ptr, U32 ix) {
3358 __m256i lo, hi;
3359 split(ix, &lo, &hi);
3360
3361 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
3362 _mm256_i32gather_ps(ptr, hi, 4));
3363 }
3364
3365 template<>
3366 U32 gather(const uint32_t* ptr, U32 ix) {
3367 __m256i lo, hi;
3368 split(ix, &lo, &hi);
3369
3370 return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4),
3371 _mm256_i32gather_epi32(ptr, hi, 4));
3372 }
3373 #else
3374 template <typename V, typename T>
3375 SI V gather(const T* ptr, U32 ix) {
3376 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3377 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
3378 }
3379 #endif
3380
3381
3382 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
3383
3384 SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
3385 #if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512))
3386 // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
3387 __m256i _01,_23;
3388 split(rgba, &_01, &_23);
3389 __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
3390 _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
3391 rgba = join<U32>(_02, _13);
3392
3393 auto cast_U16 = [](U32 v) -> U16 {
3394 __m256i _02,_13;
3395 split(v, &_02,&_13);
3396 return _mm256_packus_epi32(_02,_13);
3397 };
3398 #else
3399 auto cast_U16 = [](U32 v) -> U16 {
3400 return cast<U16>(v);
3401 };
3402 #endif
3403 *r = cast_U16(rgba & 65535) & 255;
3404 *g = cast_U16(rgba & 65535) >> 8;
3405 *b = cast_U16(rgba >> 16) & 255;
3406 *a = cast_U16(rgba >> 16) >> 8;
3407 }
3408
3409 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3410 #if 1 && defined(JUMPER_IS_NEON)
3411 uint8x8x4_t rgba;
3412 switch (tail & (N-1)) {
3413 case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
3414 case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
3415 case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
3416 case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
3417 case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
3418 case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
3419 case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
3420 case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
3421 }
3422 *r = cast<U16>(rgba.val[0]);
3423 *g = cast<U16>(rgba.val[1]);
3424 *b = cast<U16>(rgba.val[2]);
3425 *a = cast<U16>(rgba.val[3]);
3426 #else
3427 from_8888(load<U32>(ptr, tail), r,g,b,a);
3428 #endif
3429 }
3430 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3431 #if 1 && defined(JUMPER_IS_NEON)
3432 uint8x8x4_t rgba = {{
3433 cast<U8>(r),
3434 cast<U8>(g),
3435 cast<U8>(b),
3436 cast<U8>(a),
3437 }};
3438 switch (tail & (N-1)) {
3439 case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break;
3440 case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
3441 case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
3442 case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
3443 case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
3444 case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
3445 case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
3446 case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
3447 }
3448 #else
3449 store(ptr, tail, cast<U32>(r | (g<<8)) << 0
3450 | cast<U32>(b | (a<<8)) << 16);
3451 #endif
3452 }
3453
3454 STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3455 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3456 }
3457 STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3458 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3459 }
3460 STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3461 store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
3462 }
3463 STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
3464 const uint32_t* ptr;
3465 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3466 from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
3467 }
3468
3469 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
3470
3471 SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
3472 // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
3473 U16 R = (rgb >> 11) & 31,
3474 G = (rgb >> 5) & 63,
3475 B = (rgb >> 0) & 31;
3476
3477 // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
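// e.g. R = 31 (max 5-bit value): (31 << 3) | (31 >> 2) = 248 | 7 = 255.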
3478 *r = (R << 3) | (R >> 2);
3479 *g = (G << 2) | (G >> 4);
3480 *b = (B << 3) | (B >> 2);
3481 }
3482 SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3483 from_565(load<U16>(ptr, tail), r,g,b);
3484 }
3485 SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3486 // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
3487 // (Don't feel like you need to find some fundamental truth in these...
3488 // they were brute-force searched.)
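// Spot check: r = 255 -> (255*9 + 36)/74 = 31, and g = 255 -> (255*21 + 42)/85 = 63.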
3489 U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half.
3490 G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly.
3491 B = (b * 9 + 36) / 74;
3492 // Pack them back into 15|rrrrr gggggg bbbbb|0.
3493 store(ptr, tail, R << 11
3494 | G << 5
3495 | B << 0);
3496 }
3497
3498 STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
3499 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
3500 a = 255;
3501 }
3502 STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3503 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
3504 da = 255;
3505 }
3506 STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
3507 store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
3508 }
3509 STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
3510 const uint16_t* ptr;
3511 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3512 from_565(gather<U16>(ptr, ix), &r, &g, &b);
3513 a = 255;
3514 }
3515
3516 SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
3517 // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
3518 U16 R = (rgba >> 12) & 15,
3519 G = (rgba >> 8) & 15,
3520 B = (rgba >> 4) & 15,
3521 A = (rgba >> 0) & 15;
3522
3523 // Scale [0,15] to [0,255].
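// e.g. R = 0xf -> 0xff, R = 0x7 -> 0x77.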
3524 *r = (R << 4) | R;
3525 *g = (G << 4) | G;
3526 *b = (B << 4) | B;
3527 *a = (A << 4) | A;
3528 }
3529 SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3530 from_4444(load<U16>(ptr, tail), r,g,b,a);
3531 }
3532 SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3533 // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
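// e.g. r = 255 -> (255+8)/17 = 15, and r = 128 -> (128+8)/17 = 8.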
3534 U16 R = (r + 8) / 17,
3535 G = (g + 8) / 17,
3536 B = (b + 8) / 17,
3537 A = (a + 8) / 17;
3538 // Pack them back into 15|rrrr gggg bbbb aaaa|0.
3539 store(ptr, tail, R << 12
3540 | G << 8
3541 | B << 4
3542 | A << 0);
3543 }
3544
3545 STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3546 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3547 }
3548 STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3549 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3550 }
3551 STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3552 store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
3553 }
3554 STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
3555 const uint16_t* ptr;
3556 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3557 from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
3558 }
3559
3560 SI void from_88(U16 rg, U16* r, U16* g) {
3561 *r = (rg & 0xFF);
3562 *g = (rg >> 8);
3563 }
3564
3565 SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3566 #if 1 && defined(JUMPER_IS_NEON)
3567 uint8x8x2_t rg;
3568 switch (tail & (N-1)) {
3569 case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break;
3570 case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6);
3571 case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5);
3572 case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4);
3573 case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3);
3574 case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2);
3575 case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1);
3576 case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0);
3577 }
3578 *r = cast<U16>(rg.val[0]);
3579 *g = cast<U16>(rg.val[1]);
3580 #else
3581 from_88(load<U16>(ptr, tail), r,g);
3582 #endif
3583 }
3584
3585 SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3586 #if 1 && defined(JUMPER_IS_NEON)
3587 uint8x8x2_t rg = {{
3588 cast<U8>(r),
3589 cast<U8>(g),
3590 }};
3591 switch (tail & (N-1)) {
3592 case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break;
3593 case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6);
3594 case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5);
3595 case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4);
3596 case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3);
3597 case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2);
3598 case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1);
3599 case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0);
3600 }
3601 #else
3602 store(ptr, tail, cast<U16>(r | (g<<8)) << 0);
3603 #endif
3604 }
3605
3606 STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3607 b = 0;
3608 a = 255;
3609 load_88_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g);
3610 }
3611 STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3612 store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
3613 }
3614
3615 // ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
3616
3617 SI U16 load_8(const uint8_t* ptr, size_t tail) {
3618 return cast<U16>(load<U8>(ptr, tail));
3619 }
3620 SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3621 store(ptr, tail, cast<U8>(v));
3622 }
3623
3624 STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3625 r = g = b = 0;
3626 a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3627 }
3628 STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3629 dr = dg = db = 0;
3630 da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3631 }
3632 STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3633 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
3634 }
3635 STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
3636 const uint8_t* ptr;
3637 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3638 r = g = b = 0;
3639 a = cast<U16>(gather<U8>(ptr, ix));
3640 }
3641
3642 STAGE_PP(alpha_to_gray, Ctx::None) {
3643 r = g = b = a;
3644 a = 255;
3645 }
3646 STAGE_PP(alpha_to_gray_dst, Ctx::None) {
3647 dr = dg = db = da;
3648 da = 255;
3649 }
3650 STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) {
3651 a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
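// The weights sum to exactly 256, so r = g = b = 255 maps back to a = 255 with no drift.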
3652 r = g = b = 0;
3653 }
3654
3655 // ~~~~~~ Saving and restoring the src/dst registers ~~~~~~ //
3656
3657 STAGE_PP(load_src, const uint16_t* ptr) {
3658 r = sk_unaligned_load<U16>(ptr + 0*N);
3659 g = sk_unaligned_load<U16>(ptr + 1*N);
3660 b = sk_unaligned_load<U16>(ptr + 2*N);
3661 a = sk_unaligned_load<U16>(ptr + 3*N);
3662 }
3663 STAGE_PP(store_src, uint16_t* ptr) {
3664 sk_unaligned_store(ptr + 0*N, r);
3665 sk_unaligned_store(ptr + 1*N, g);
3666 sk_unaligned_store(ptr + 2*N, b);
3667 sk_unaligned_store(ptr + 3*N, a);
3668 }
3669 STAGE_PP(load_dst, const uint16_t* ptr) {
3670 dr = sk_unaligned_load<U16>(ptr + 0*N);
3671 dg = sk_unaligned_load<U16>(ptr + 1*N);
3672 db = sk_unaligned_load<U16>(ptr + 2*N);
3673 da = sk_unaligned_load<U16>(ptr + 3*N);
3674 }
3675 STAGE_PP(store_dst, uint16_t* ptr) {
3676 sk_unaligned_store(ptr + 0*N, dr);
3677 sk_unaligned_store(ptr + 1*N, dg);
3678 sk_unaligned_store(ptr + 2*N, db);
3679 sk_unaligned_store(ptr + 3*N, da);
3680 }
3681
3682 // ~~~~~~ Coverage scales / lerps ~~~~~~ //
3683
3684 STAGE_PP(scale_1_float, const float* f) {
3685 U16 c = from_float(*f);
3686 r = div255( r * c );
3687 g = div255( g * c );
3688 b = div255( b * c );
3689 a = div255( a * c );
3690 }
3691 STAGE_PP(lerp_1_float, const float* f) {
3692 U16 c = from_float(*f);
3693 r = lerp(dr, r, c);
3694 g = lerp(dg, g, c);
3695 b = lerp(db, b, c);
3696 a = lerp(da, a, c);
3697 }
3698 STAGE_PP(lerp_native, const uint16_t scales[]) {
3699 auto c = sk_unaligned_load<U16>(scales);
3700 r = lerp(dr, r, c);
3701 g = lerp(dg, g, c);
3702 b = lerp(db, b, c);
3703 a = lerp(da, a, c);
3704 }
3705
3706 STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3707 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3708 r = div255( r * c );
3709 g = div255( g * c );
3710 b = div255( b * c );
3711 a = div255( a * c );
3712 }
3713 STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3714 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3715 r = lerp(dr, r, c);
3716 g = lerp(dg, g, c);
3717 b = lerp(db, b, c);
3718 a = lerp(da, a, c);
3719 }
3720
3721 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
3722 SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
3723 return if_then_else(a < da, min(cr,cg,cb)
3724 , max(cr,cg,cb));
3725 }
3726 STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
3727 U16 cr,cg,cb;
3728 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3729 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3730
3731 r = div255( r * cr );
3732 g = div255( g * cg );
3733 b = div255( b * cb );
3734 a = div255( a * ca );
3735 }
3736 STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
3737 U16 cr,cg,cb;
3738 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3739 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3740
3741 r = lerp(dr, r, cr);
3742 g = lerp(dg, g, cg);
3743 b = lerp(db, b, cb);
3744 a = lerp(da, a, ca);
3745 }
3746
3747 STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
3748 U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
3749 add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
3750
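// min(..., a) keeps each channel at or below alpha, so the result stays valid premul.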
3751 r = min(div255(r*mul) + add, a);
3752 g = min(div255(g*mul) + add, a);
3753 b = min(div255(b*mul) + add, a);
3754 }
3755
3756
3757 // ~~~~~~ Gradient stages ~~~~~~ //
3758
3759 // Clamp x to [0,1], both sides inclusive (think, gradients).
3760 // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
3761 SI F clamp_01(F v) { return min(max(0, v), 1); }
3762
3763 STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
3764 STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
3765 STAGE_GG(mirror_x_1, Ctx::None) {
3766 auto two = [](F x){ return x+x; };
3767 x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
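// e.g. x = 1.25: |0.25 - 2*floor(0.125) - 1| = 0.75, reflecting about x = 1.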
3768 }
3769
3770 SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
3771
3772 STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
3773 auto w = ctx->limit_x;
3774 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
3775 }
3776 STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
3777 auto h = ctx->limit_y;
3778 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
3779 }
3780 STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
3781 auto w = ctx->limit_x;
3782 auto h = ctx->limit_y;
3783 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
3784 }
3785 STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
3786 auto mask = sk_unaligned_load<U16>(ctx->mask);
3787 r = r & mask;
3788 g = g & mask;
3789 b = b & mask;
3790 a = a & mask;
3791 }
3792
3793 SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul,
3794 U16* r, U16* g, U16* b, U16* a) {
3795 auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
3796
3797 F limit = interpolatedInPremul ? A
3798 : 1;
3799 *r = round(min(max(0,R), limit));
3800 *g = round(min(max(0,G), limit));
3801 *b = round(min(max(0,B), limit));
3802 *a = round(A); // we assume alpha is already in [0,1].
3803 }
3804
3805 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3806 U16* r, U16* g, U16* b, U16* a) {
3807
3808 F fr, fg, fb, fa, br, bg, bb, ba;
3809 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3810 if (c->stopCount <=8) {
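// With at most 8 stops, each channel's f/b table fits in a single __m256,
// so a cheap lane permute stands in for a full gather.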
3811 __m256i lo, hi;
3812 split(idx, &lo, &hi);
3813
3814 fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
3815 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
3816 br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
3817 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
3818 fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
3819 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
3820 bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
3821 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
3822 fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
3823 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
3824 bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
3825 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
3826 fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
3827 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
3828 ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
3829 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
3830 } else
3831 #endif
3832 {
3833 fr = gather<F>(c->fs[0], idx);
3834 fg = gather<F>(c->fs[1], idx);
3835 fb = gather<F>(c->fs[2], idx);
3836 fa = gather<F>(c->fs[3], idx);
3837 br = gather<F>(c->bs[0], idx);
3838 bg = gather<F>(c->bs[1], idx);
3839 bb = gather<F>(c->bs[2], idx);
3840 ba = gather<F>(c->bs[3], idx);
3841 }
3842 round_F_to_U16(mad(t, fr, br),
3843 mad(t, fg, bg),
3844 mad(t, fb, bb),
3845 mad(t, fa, ba),
3846 c->interpolatedInPremul,
3847 r,g,b,a);
3848 }
3849
3850 STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
3851 auto t = x;
3852 U32 idx = 0;
3853
3854 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
3855 for (size_t i = 1; i < c->stopCount; i++) {
3856 idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
3857 }
3858
3859 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3860 }
3861
3862 STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
3863 auto t = x;
3864 auto idx = trunc_(t * (c->stopCount-1));
3865 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3866 }
3867
3868 STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
3869 auto t = x;
3870 round_F_to_U16(mad(t, c->f[0], c->b[0]),
3871 mad(t, c->f[1], c->b[1]),
3872 mad(t, c->f[2], c->b[2]),
3873 mad(t, c->f[3], c->b[3]),
3874 c->interpolatedInPremul,
3875 &r,&g,&b,&a);
3876 }
3877
3878 STAGE_GG(xy_to_unit_angle, Ctx::None) {
3879 F xabs = abs_(x),
3880 yabs = abs_(y);
3881
3882 F slope = min(xabs, yabs)/max(xabs, yabs);
3883 F s = slope * slope;
3884
3885 // Use a 7th degree polynomial to approximate atan.
3886 // This was generated using sollya.gforge.inria.fr.
3887 // A float optimized polynomial was generated using the following command.
3888 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
3889 F phi = slope
3890 * (0.15912117063999176025390625f + s
3891 * (-5.185396969318389892578125e-2f + s
3892 * (2.476101927459239959716796875e-2f + s
3893 * (-7.0547382347285747528076171875e-3f))));
3894
3895 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
3896 phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi);
3897 phi = if_then_else(y < 0.0f , 1.0f - phi , phi);
3898 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
3899 x = phi;
3900 }
3901 STAGE_GG(xy_to_radius, Ctx::None) {
3902 x = sqrt_(x*x + y*y);
3903 }
3904
3905 // ~~~~~~ Compound stages ~~~~~~ //
3906
3907 STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3908 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3909
3910 load_8888_(ptr, tail, &dr,&dg,&db,&da);
3911 r = r + div255( dr*inv(a) );
3912 g = g + div255( dg*inv(a) );
3913 b = b + div255( db*inv(a) );
3914 a = a + div255( da*inv(a) );
3915 store_8888_(ptr, tail, r,g,b,a);
3916 }
3917
3918 #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE)
3919 static void(*bilerp_clamp_8888)(void) = nullptr;
3920 static void(*bilinear)(void) = nullptr;
3921 #else
3922 STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
3923 // (cx,cy) are the center of our sample.
3924 F cx = x,
3925 cy = y;
3926
3927 // All sample points are at the same fractional offset (fx,fy).
3928 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
3929 F fx = fract(cx + 0.5f),
3930 fy = fract(cy + 0.5f);
3931
3932 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
3933 r = g = b = a = 0;
3934
3935 // The first three sample points will calculate their area using math
3936 // just like in the float code above, but the fourth will take up all the rest.
3937 //
3938 // Logically this is the same as doing the math for the fourth pixel too,
3939 // but rounding error makes this a better strategy, keeping opaque opaque, etc.
3940 //
3941 // We can keep up to 8 bits of fractional precision without overflowing 16-bit,
3942 // so our "1.0" area is 256.
3943 const uint16_t bias = 256;
3944 U16 remaining = bias;
3945
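// e.g. fx = 0.25, fy = 0.5: the first three samples get areas 96, 32, and 96 of 256,
// and the last takes the remaining 32, so the weights always total exactly 256.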
3946 for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
3947 for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
3948 // (x,y) are the coordinates of this sample point.
3949 F x = cx + dx,
3950 y = cy + dy;
3951
3952 // ix_and_ptr() will clamp to the image's bounds for us.
3953 const uint32_t* ptr;
3954 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3955
3956 U16 sr,sg,sb,sa;
3957 from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa);
3958
3959 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3960 // are combined in direct proportion to their area overlapping that logical query pixel.
3961 // At positive offsets, the x-axis contribution to that rectangle is fx,
3962 // or (1-fx) at negative x. Same deal for y.
3963 F sx = (dx > 0) ? fx : 1.0f - fx,
3964 sy = (dy > 0) ? fy : 1.0f - fy;
3965
3966 U16 area = (dy == 0.5f && dx == 0.5f) ? remaining
3967 : cast<U16>(sx * sy * bias);
3968 for (size_t i = 0; i < N; i++) {
3969 SkASSERT(remaining[i] >= area[i]);
3970 }
3971 remaining -= area;
3972
3973 r += sr * area;
3974 g += sg * area;
3975 b += sb * area;
3976 a += sa * area;
3977 }
3978
3979 r = (r + bias/2) / bias;
3980 g = (g + bias/2) / bias;
3981 b = (b + bias/2) / bias;
3982 a = (a + bias/2) / bias;
3983 }
3984
3985 // TODO: lowp::tile() is identical to the highp tile()... share?
3986 SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
3987 // ix_and_ptr() will clamp the output of tile() afterwards, so we need not clamp here.
3988 switch (mode) {
3989 case SkTileMode::kDecal: // TODO, for now fallthrough to clamp
3990 case SkTileMode::kClamp: return v;
3991 case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
3992 case SkTileMode::kMirror:
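// e.g. limit = 4, v = 5 reflects to 3: |(5-4) - 8*floor(1/8) - 4| = 3.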
3993 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
3994 }
3995 SkUNREACHABLE;
3996 }
3997
3998 SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
3999 U16* r, U16* g, U16* b, U16* a) {
4000 x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
4001 y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
4002
4003 switch (ctx->ct) {
4004 default: *r = *g = *b = *a = 0; // TODO
4005 break;
4006
4007 case kRGBA_8888_SkColorType:
4008 case kBGRA_8888_SkColorType: {
4009 const uint32_t* ptr;
4010 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4011 from_8888(gather<U32>(ptr, ix), r,g,b,a);
4012 if (ctx->ct == kBGRA_8888_SkColorType) {
4013 std::swap(*r,*b);
4014 }
4015 } break;
4016 }
4017 }
4018
4019 template <int D>
4020 SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
4021 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
4022 U16* r, U16* g, U16* b, U16* a) {
4023
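// D is the filter's diameter in pixels (2 for the bilinear stage below), so the D*D
// window of samples starts at an offset of -0.5f*(D-1) from the center (cx,cy).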
4024 float start = -0.5f*(D-1);
4025
4026 const uint16_t bias = 256;
4027 U16 remaining = bias;
4028
4029 *r = *g = *b = *a = 0;
4030 F y = cy + start;
4031 for (int j = 0; j < D; j++, y += 1.0f) {
4032 F x = cx + start;
4033 for (int i = 0; i < D; i++, x += 1.0f) {
4034 U16 R,G,B,A;
4035 sample(ctx, x,y, &R,&G,&B,&A);
4036
4037 U16 w = (i == D-1 && j == D-1) ? remaining
4038 : cast<U16>(wx[i]*wy[j]*bias);
4039 remaining -= w;
4040 *r += w*R;
4041 *g += w*G;
4042 *b += w*B;
4043 *a += w*A;
4044 }
4045 }
4046 *r = (*r + bias/2) / bias;
4047 *g = (*g + bias/2) / bias;
4048 *b = (*b + bias/2) / bias;
4049 *a = (*a + bias/2) / bias;
4050 }
4051
4052 STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
4053 F fx = fract(x + 0.5f),
4054 fy = fract(y + 0.5f);
4055 const F wx[] = {1.0f - fx, fx};
4056 const F wy[] = {1.0f - fy, fy};
4057
4058 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
4059 }
4060 #endif
4061
4062 // ~~~~~~ GrSwizzle stage ~~~~~~ //
4063
4064 STAGE_PP(swizzle, void* ctx) {
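// The four swizzle characters are packed into the bits of the context pointer itself,
// so we read them straight out of `ctx` rather than dereferencing it.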
4065 auto ir = r, ig = g, ib = b, ia = a;
4066 U16* o[] = {&r, &g, &b, &a};
4067 char swiz[4];
4068 memcpy(swiz, &ctx, sizeof(swiz));
4069
4070 for (int i = 0; i < 4; ++i) {
4071 switch (swiz[i]) {
4072 case 'r': *o[i] = ir; break;
4073 case 'g': *o[i] = ig; break;
4074 case 'b': *o[i] = ib; break;
4075 case 'a': *o[i] = ia; break;
4076 case '0': *o[i] = U16(0); break;
4077 case '1': *o[i] = U16(255); break;
4078 default: break;
4079 }
4080 }
4081 }
4082
4083 // Now we'll add null stand-ins for stages we haven't implemented in lowp.
4084 // If a pipeline uses any of these stages, it gets booted out of lowp into highp.
4085 #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
4086 NOT_IMPLEMENTED(callback)
4087 NOT_IMPLEMENTED(interpreter)
4088 NOT_IMPLEMENTED(unbounded_set_rgb)
4089 NOT_IMPLEMENTED(unbounded_uniform_color)
4090 NOT_IMPLEMENTED(unpremul)
4091 NOT_IMPLEMENTED(dither) // TODO
4092 NOT_IMPLEMENTED(from_srgb)
4093 NOT_IMPLEMENTED(to_srgb)
4094 NOT_IMPLEMENTED(load_16161616)
4095 NOT_IMPLEMENTED(store_16161616)
4096 NOT_IMPLEMENTED(load_a16)
4097 NOT_IMPLEMENTED(store_a16)
4098 NOT_IMPLEMENTED(load_rg1616)
4099 NOT_IMPLEMENTED(store_rg1616)
4100 NOT_IMPLEMENTED(load_f16)
4101 NOT_IMPLEMENTED(load_f16_dst)
4102 NOT_IMPLEMENTED(store_f16)
4103 NOT_IMPLEMENTED(gather_f16)
4104 NOT_IMPLEMENTED(load_af16)
4105 NOT_IMPLEMENTED(store_af16)
4106 NOT_IMPLEMENTED(load_rgf16)
4107 NOT_IMPLEMENTED(store_rgf16)
4108 NOT_IMPLEMENTED(load_f32)
4109 NOT_IMPLEMENTED(load_f32_dst)
4110 NOT_IMPLEMENTED(store_f32)
4111 NOT_IMPLEMENTED(gather_f32)
4112 NOT_IMPLEMENTED(load_rgf32)
4113 NOT_IMPLEMENTED(store_rgf32)
4114 NOT_IMPLEMENTED(load_1010102)
4115 NOT_IMPLEMENTED(load_1010102_dst)
4116 NOT_IMPLEMENTED(store_1010102)
4117 NOT_IMPLEMENTED(gather_1010102)
4118 NOT_IMPLEMENTED(store_u16_be)
4119 NOT_IMPLEMENTED(byte_tables) // TODO
4120 NOT_IMPLEMENTED(colorburn)
4121 NOT_IMPLEMENTED(colordodge)
4122 NOT_IMPLEMENTED(softlight)
4123 NOT_IMPLEMENTED(hue)
4124 NOT_IMPLEMENTED(saturation)
4125 NOT_IMPLEMENTED(color)
4126 NOT_IMPLEMENTED(luminosity)
4127 NOT_IMPLEMENTED(matrix_3x3)
4128 NOT_IMPLEMENTED(matrix_3x4)
4129 NOT_IMPLEMENTED(matrix_4x5) // TODO
4130 NOT_IMPLEMENTED(matrix_4x3) // TODO
4131 NOT_IMPLEMENTED(parametric)
4132 NOT_IMPLEMENTED(gamma_)
4133 NOT_IMPLEMENTED(rgb_to_hsl)
4134 NOT_IMPLEMENTED(hsl_to_rgb)
4135 NOT_IMPLEMENTED(gauss_a_to_rgba) // TODO
4136 NOT_IMPLEMENTED(mirror_x) // TODO
4137 NOT_IMPLEMENTED(repeat_x) // TODO
4138 NOT_IMPLEMENTED(mirror_y) // TODO
4139 NOT_IMPLEMENTED(repeat_y) // TODO
4140 NOT_IMPLEMENTED(negate_x)
4141 NOT_IMPLEMENTED(bicubic) // TODO if I can figure out negative weights
4142 NOT_IMPLEMENTED(bicubic_clamp_8888)
4143 NOT_IMPLEMENTED(bilinear_nx) // TODO
4144 NOT_IMPLEMENTED(bilinear_ny) // TODO
4145 NOT_IMPLEMENTED(bilinear_px) // TODO
4146 NOT_IMPLEMENTED(bilinear_py) // TODO
4147 NOT_IMPLEMENTED(bicubic_n3x) // TODO
4148 NOT_IMPLEMENTED(bicubic_n1x) // TODO
4149 NOT_IMPLEMENTED(bicubic_p1x) // TODO
4150 NOT_IMPLEMENTED(bicubic_p3x) // TODO
4151 NOT_IMPLEMENTED(bicubic_n3y) // TODO
4152 NOT_IMPLEMENTED(bicubic_n1y) // TODO
4153 NOT_IMPLEMENTED(bicubic_p1y) // TODO
4154 NOT_IMPLEMENTED(bicubic_p3y) // TODO
4155 NOT_IMPLEMENTED(save_xy) // TODO
4156 NOT_IMPLEMENTED(accumulate) // TODO
4157 NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved)
4158 NOT_IMPLEMENTED(xy_to_2pt_conical_strip)
4159 NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle)
4160 NOT_IMPLEMENTED(xy_to_2pt_conical_smaller)
4161 NOT_IMPLEMENTED(xy_to_2pt_conical_greater)
4162 NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal)
4163 NOT_IMPLEMENTED(alter_2pt_conical_unswap)
4164 NOT_IMPLEMENTED(mask_2pt_conical_nan)
4165 NOT_IMPLEMENTED(mask_2pt_conical_degenerates)
4166 NOT_IMPLEMENTED(apply_vector_mask)
4167 #undef NOT_IMPLEMENTED
4168
4169 #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
4170 } // namespace lowp
4171
4172 } // namespace SK_OPTS_NS
4173
4174 #endif//SkRasterPipeline_opts_DEFINED
4175