1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkRasterPipeline_opts_DEFINED
9 #define SkRasterPipeline_opts_DEFINED
10 
11 #include "include/core/SkTypes.h"
12 #include "src/core/SkUtils.h"  // unaligned_{load,store}
13 #include "src/sksl/SkSLByteCode.h"
14 
15 // Every function in this file should be marked static and inline using SI.
16 #if defined(__clang__)
17     #define SI __attribute__((always_inline)) static inline
18 #else
19     #define SI static inline
20 #endif
21 
22 template <typename Dst, typename Src>
23 SI Dst bit_cast(const Src& src) {
24     static_assert(sizeof(Dst) == sizeof(Src), "");
25     return sk_unaligned_load<Dst>(&src);
26 }
27 
28 template <typename Dst, typename Src>
29 SI Dst widen_cast(const Src& src) {
30     static_assert(sizeof(Dst) > sizeof(Src), "");
31     Dst dst;
32     memcpy(&dst, &src, sizeof(Src));
33     return dst;
34 }
35 
36 // Our program is an array of void*, either
37 //   - 1 void* per stage with no context pointer, the next stage;
38 //   - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
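// Illustrative sketch only: a program whose first stage takes a context and whose last does not
// would be laid out as
//   void* program[] = { (void*)stage_with_ctx, ctx, (void*)just_return };
// (stage_with_ctx/ctx are placeholders); each stage walks this array with load_and_inc() below.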
39 
40 // load_and_inc() steps the program forward by 1 void*, returning that pointer.
41 SI void* load_and_inc(void**& program) {
42 #if defined(__GNUC__) && defined(__x86_64__)
43     // If program is in %rsi (we try to make this likely) then this is a single instruction.
44     void* rax;
45     asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
46     return rax;
47 #else
48     // On ARM *program++ compiles into pretty ideal code without any handholding.
49     return *program++;
50 #endif
51 }
52 
53 // Lazily resolved on first cast.  Does nothing if cast to Ctx::None.
54 struct Ctx {
55     struct None {};
56 
57     void*   ptr;
58     void**& program;
59 
60     explicit Ctx(void**& p) : ptr(nullptr), program(p) {}
61 
62     template <typename T>
63     operator T*() {
64         if (!ptr) { ptr = load_and_inc(program); }
65         return (T*)ptr;
66     }
67     operator None() { return None{}; }
68 };
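// Usage sketch: when the STAGE macros below pass Ctx{program} to a stage whose context parameter is,
// say, `const float* rate`, the conversion operator pops that pointer off the program; a stage
// declared with Ctx::None consumes nothing.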
69 
70 
71 #if !defined(__clang__)
72     #define JUMPER_IS_SCALAR
73 #elif defined(SK_ARM_HAS_NEON)
74     #define JUMPER_IS_NEON
75 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX512
76     #define JUMPER_IS_AVX512
77 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
78     #define JUMPER_IS_HSW
79 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
80     #define JUMPER_IS_AVX
81 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
82     #define JUMPER_IS_SSE41
83 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
84     #define JUMPER_IS_SSE2
85 #else
86     #define JUMPER_IS_SCALAR
87 #endif
88 
89 // Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
90 #if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
91     // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
92     #if defined(__apple_build_version__) && __clang_major__ < 9
93         #define JUMPER_IS_SCALAR
94     #elif __clang_major__ < 5
95         #define JUMPER_IS_SCALAR
96     #endif
97 
98     #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
99         #undef  JUMPER_IS_NEON
100     #endif
101 #endif
102 
103 #if defined(JUMPER_IS_SCALAR)
104     #include <math.h>
105 #elif defined(JUMPER_IS_NEON)
106     #include <arm_neon.h>
107 #else
108     #include <immintrin.h>
109 #endif
110 
111 namespace SK_OPTS_NS {
112 
113 #if defined(JUMPER_IS_SCALAR)
114     // This path should lead to portable scalar code.
115     using F   = float   ;
116     using I32 =  int32_t;
117     using U64 = uint64_t;
118     using U32 = uint32_t;
119     using U16 = uint16_t;
120     using U8  = uint8_t ;
121 
122     SI F   mad(F f, F m, F a)   { return f*m+a; }
123     SI F   min(F a, F b)        { return fminf(a,b); }
124     SI F   max(F a, F b)        { return fmaxf(a,b); }
125     SI F   abs_  (F v)          { return fabsf(v); }
126     SI F   floor_(F v)          { return floorf(v); }
127     SI F   rcp   (F v)          { return 1.0f / v; }
128     SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
129     SI F    sqrt_(F v)          { return sqrtf(v); }
130     SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
131     SI U16 pack(U32 v)          { return (U16)v; }
132     SI U8  pack(U16 v)          { return  (U8)v; }
133 
134     SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
135 
136     template <typename T>
137     SI T gather(const T* p, U32 ix) { return p[ix]; }
138 
139     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
140         *r = ptr[0];
141         *g = ptr[1];
142     }
143     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
144         ptr[0] = r;
145         ptr[1] = g;
146     }
147     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
148         *r = ptr[0];
149         *g = ptr[1];
150         *b = ptr[2];
151     }
152     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
153         *r = ptr[0];
154         *g = ptr[1];
155         *b = ptr[2];
156         *a = ptr[3];
157     }
158     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
159         ptr[0] = r;
160         ptr[1] = g;
161         ptr[2] = b;
162         ptr[3] = a;
163     }
164 
165     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
166         *r = ptr[0];
167         *g = ptr[1];
168     }
169     SI void store2(float* ptr, size_t tail, F r, F g) {
170         ptr[0] = r;
171         ptr[1] = g;
172     }
173     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
174         *r = ptr[0];
175         *g = ptr[1];
176         *b = ptr[2];
177         *a = ptr[3];
178     }
179     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
180         ptr[0] = r;
181         ptr[1] = g;
182         ptr[2] = b;
183         ptr[3] = a;
184     }
185 
186 #elif defined(JUMPER_IS_NEON)
187     // Since we know we're using Clang, we can use its vector extensions.
188     template <typename T> using V = T __attribute__((ext_vector_type(4)));
189     using F   = V<float   >;
190     using I32 = V< int32_t>;
191     using U64 = V<uint64_t>;
192     using U32 = V<uint32_t>;
193     using U16 = V<uint16_t>;
194     using U8  = V<uint8_t >;
195 
196     // We polyfill a few routines that Clang doesn't build into ext_vector_types.
197     SI F   min(F a, F b)                         { return vminq_f32(a,b);          }
198     SI F   max(F a, F b)                         { return vmaxq_f32(a,b);          }
199     SI F   abs_  (F v)                           { return vabsq_f32(v);            }
200     SI F   rcp   (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
201     SI F   rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
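    // Note (added): vrecpsq_f32(v,e) returns 2 - v*e and vrsqrtsq_f32(v,e*e) returns (3 - v*e*e)/2,
    // so rcp() and rsqrt() above each apply one Newton-Raphson refinement to the hardware estimate.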
202     SI U16 pack(U32 v)                           { return __builtin_convertvector(v, U16); }
203     SI U8  pack(U16 v)                           { return __builtin_convertvector(v,  U8); }
204 
205     SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
206 
207     #if defined(SK_CPU_ARM64)
208         SI F     mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
209         SI F  floor_(F v) { return vrndmq_f32(v); }
210         SI F   sqrt_(F v) { return vsqrtq_f32(v); }
211         SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
212     #else
213         SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
214         SI F floor_(F v) {
215             F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
216             return roundtrip - if_then_else(roundtrip > v, 1, 0);
217         }
218 
219         SI F sqrt_(F v) {
220             auto e = vrsqrteq_f32(v);  // Estimate and two refinement steps for e = rsqrt(v).
221             e *= vrsqrtsq_f32(v,e*e);
222             e *= vrsqrtsq_f32(v,e*e);
223             return v*e;                // sqrt(v) == v*rsqrt(v).
224         }
225 
226         SI U32 round(F v, F scale) {
227             return vcvtq_u32_f32(mad(v,scale,0.5f));
228         }
229     #endif
230 
231 
232     template <typename T>
233     SI V<T> gather(const T* p, U32 ix) {
234         return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
235     }
236     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
237         uint16x4x2_t rg;
238         if (__builtin_expect(tail,0)) {
239             if (  true  ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
240             if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
241             if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
242         } else {
243             rg = vld2_u16(ptr);
244         }
245         *r = rg.val[0];
246         *g = rg.val[1];
247     }
248     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
249         if (__builtin_expect(tail,0)) {
250             if (  true  ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
251             if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
252             if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
253         } else {
254             vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
255         }
256     }
257     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
258         uint16x4x3_t rgb;
259         if (__builtin_expect(tail,0)) {
260             if (  true  ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
261             if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
262             if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
263         } else {
264             rgb = vld3_u16(ptr);
265         }
266         *r = rgb.val[0];
267         *g = rgb.val[1];
268         *b = rgb.val[2];
269     }
270     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
271         uint16x4x4_t rgba;
272         if (__builtin_expect(tail,0)) {
273             if (  true  ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
274             if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
275             if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
276         } else {
277             rgba = vld4_u16(ptr);
278         }
279         *r = rgba.val[0];
280         *g = rgba.val[1];
281         *b = rgba.val[2];
282         *a = rgba.val[3];
283     }
284 
285     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
286         if (__builtin_expect(tail,0)) {
287             if (  true  ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
288             if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
289             if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
290         } else {
291             vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
292         }
293     }
294     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
295         float32x4x2_t rg;
296         if (__builtin_expect(tail,0)) {
297             if (  true  ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
298             if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
299             if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
300         } else {
301             rg = vld2q_f32(ptr);
302         }
303         *r = rg.val[0];
304         *g = rg.val[1];
305     }
306     SI void store2(float* ptr, size_t tail, F r, F g) {
307         if (__builtin_expect(tail,0)) {
308             if (  true  ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
309             if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
310             if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
311         } else {
312             vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
313         }
314     }
315     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
316         float32x4x4_t rgba;
317         if (__builtin_expect(tail,0)) {
318             if (  true  ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
319             if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
320             if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
321         } else {
322             rgba = vld4q_f32(ptr);
323         }
324         *r = rgba.val[0];
325         *g = rgba.val[1];
326         *b = rgba.val[2];
327         *a = rgba.val[3];
328     }
329     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
330         if (__builtin_expect(tail,0)) {
331             if (  true  ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
332             if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
333             if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
334         } else {
335             vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
336         }
337     }
338 
339 #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
340     // These are __m256 and __m256i, but friendlier and strongly-typed.
341     template <typename T> using V = T __attribute__((ext_vector_type(8)));
342     using F   = V<float   >;
343     using I32 = V< int32_t>;
344     using U64 = V<uint64_t>;
345     using U32 = V<uint32_t>;
346     using U16 = V<uint16_t>;
347     using U8  = V<uint8_t >;
348 
349     SI F mad(F f, F m, F a)  {
350     #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
351         return _mm256_fmadd_ps(f,m,a);
352     #else
353         return f*m+a;
354     #endif
355     }
356 
357     SI F   min(F a, F b)        { return _mm256_min_ps(a,b);    }
358     SI F   max(F a, F b)        { return _mm256_max_ps(a,b);    }
359     SI F   abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
360     SI F   floor_(F v)          { return _mm256_floor_ps(v);    }
361     SI F   rcp   (F v)          { return _mm256_rcp_ps  (v);    }
362     SI F   rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
363     SI F    sqrt_(F v)          { return _mm256_sqrt_ps (v);    }
364     SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
365 
366     SI U16 pack(U32 v) {
367         return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
368                                 _mm256_extractf128_si256(v, 1));
369     }
370     SI U8 pack(U16 v) {
371         auto r = _mm_packus_epi16(v,v);
372         return sk_unaligned_load<U8>(&r);
373     }
374 
375     SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
376 
377     template <typename T>
378     SI V<T> gather(const T* p, U32 ix) {
379         return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
380                  p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
381     }
382     #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
383         SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
384         SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
385         SI U64 gather(const uint64_t* p, U32 ix) {
386             __m256i parts[] = {
387                 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
388                 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
389             };
390             return bit_cast<U64>(parts);
391         }
392     #endif
393 
394     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
395         U16 _0123, _4567;
396         if (__builtin_expect(tail,0)) {
397             _0123 = _4567 = _mm_setzero_si128();
398             auto* d = &_0123;
399             if (tail > 3) {
400                 *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
401                 tail -= 4;
402                 ptr += 8;
403                 d = &_4567;
404             }
405             bool high = false;
406             if (tail > 1) {
407                 *d = _mm_loadu_si64(ptr);
408                 tail -= 2;
409                 ptr += 4;
410                 high = true;
411             }
412             if (tail > 0) {
413                 (*d)[high ? 4 : 0] = *(ptr + 0);
414                 (*d)[high ? 5 : 1] = *(ptr + 1);
415             }
416         } else {
417             _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
418             _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
419         }
420         *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
421                              _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
422         *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16),
423                              _mm_srai_epi32(_4567, 16));
424     }
425     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
426         auto _0123 = _mm_unpacklo_epi16(r, g),
427              _4567 = _mm_unpackhi_epi16(r, g);
428         if (__builtin_expect(tail,0)) {
429             const auto* s = &_0123;
430             if (tail > 3) {
431                 _mm_storeu_si128((__m128i*)ptr, *s);
432                 s = &_4567;
433                 tail -= 4;
434                 ptr += 8;
435             }
436             bool high = false;
437             if (tail > 1) {
438                 _mm_storel_epi64((__m128i*)ptr, *s);
439                 ptr += 4;
440                 tail -= 2;
441                 high = true;
442             }
443             if (tail > 0) {
444                 if (high) {
445                     *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
446                 } else {
447                     *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
448                 }
449             }
450         } else {
451             _mm_storeu_si128((__m128i*)ptr + 0, _0123);
452             _mm_storeu_si128((__m128i*)ptr + 1, _4567);
453         }
454     }
455 
456     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
457         __m128i _0,_1,_2,_3,_4,_5,_6,_7;
458         if (__builtin_expect(tail,0)) {
459             auto load_rgb = [](const uint16_t* src) {
460                 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
461                 return _mm_insert_epi16(v, src[2], 2);
462             };
463             _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
464             if (  true  ) { _0 = load_rgb(ptr +  0); }
465             if (tail > 1) { _1 = load_rgb(ptr +  3); }
466             if (tail > 2) { _2 = load_rgb(ptr +  6); }
467             if (tail > 3) { _3 = load_rgb(ptr +  9); }
468             if (tail > 4) { _4 = load_rgb(ptr + 12); }
469             if (tail > 5) { _5 = load_rgb(ptr + 15); }
470             if (tail > 6) { _6 = load_rgb(ptr + 18); }
471         } else {
472             // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
473             auto _01 =                _mm_loadu_si128((const __m128i*)(ptr +  0))    ;
474             auto _23 =                _mm_loadu_si128((const __m128i*)(ptr +  6))    ;
475             auto _45 =                _mm_loadu_si128((const __m128i*)(ptr + 12))    ;
476             auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
477             _0 = _01; _1 = _mm_srli_si128(_01, 6);
478             _2 = _23; _3 = _mm_srli_si128(_23, 6);
479             _4 = _45; _5 = _mm_srli_si128(_45, 6);
480             _6 = _67; _7 = _mm_srli_si128(_67, 6);
481         }
482 
483         auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
484              _13 = _mm_unpacklo_epi16(_1, _3),
485              _46 = _mm_unpacklo_epi16(_4, _6),
486              _57 = _mm_unpacklo_epi16(_5, _7);
487 
488         auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
489              bx0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 xx xx xx xx
490              rg4567 = _mm_unpacklo_epi16(_46, _57),
491              bx4567 = _mm_unpackhi_epi16(_46, _57);
492 
493         *r = _mm_unpacklo_epi64(rg0123, rg4567);
494         *g = _mm_unpackhi_epi64(rg0123, rg4567);
495         *b = _mm_unpacklo_epi64(bx0123, bx4567);
496     }
497     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
498         __m128i _01, _23, _45, _67;
499         if (__builtin_expect(tail,0)) {
500             auto src = (const double*)ptr;
501             _01 = _23 = _45 = _67 = _mm_setzero_si128();
502             if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
503             if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
504             if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
505             if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
506             if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
507             if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
508             if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
509         } else {
510             _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
511             _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
512             _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
513             _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
514         }
515 
516         auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
517              _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
518              _46 = _mm_unpacklo_epi16(_45, _67),
519              _57 = _mm_unpackhi_epi16(_45, _67);
520 
521         auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
522              ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
523              rg4567 = _mm_unpacklo_epi16(_46, _57),
524              ba4567 = _mm_unpackhi_epi16(_46, _57);
525 
526         *r = _mm_unpacklo_epi64(rg0123, rg4567);
527         *g = _mm_unpackhi_epi64(rg0123, rg4567);
528         *b = _mm_unpacklo_epi64(ba0123, ba4567);
529         *a = _mm_unpackhi_epi64(ba0123, ba4567);
530     }
531     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
532         auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
533              rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
534              ba0123 = _mm_unpacklo_epi16(b, a),
535              ba4567 = _mm_unpackhi_epi16(b, a);
536 
537         auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
538              _23 = _mm_unpackhi_epi32(rg0123, ba0123),
539              _45 = _mm_unpacklo_epi32(rg4567, ba4567),
540              _67 = _mm_unpackhi_epi32(rg4567, ba4567);
541 
542         if (__builtin_expect(tail,0)) {
543             auto dst = (double*)ptr;
544             if (tail > 0) { _mm_storel_pd(dst+0, _01); }
545             if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
546             if (tail > 2) { _mm_storel_pd(dst+2, _23); }
547             if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
548             if (tail > 4) { _mm_storel_pd(dst+4, _45); }
549             if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
550             if (tail > 6) { _mm_storel_pd(dst+6, _67); }
551         } else {
552             _mm_storeu_si128((__m128i*)ptr + 0, _01);
553             _mm_storeu_si128((__m128i*)ptr + 1, _23);
554             _mm_storeu_si128((__m128i*)ptr + 2, _45);
555             _mm_storeu_si128((__m128i*)ptr + 3, _67);
556         }
557     }
558 
559     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
560         F _0123, _4567;
561         if (__builtin_expect(tail, 0)) {
562             _0123 = _4567 = _mm256_setzero_ps();
563             F* d = &_0123;
564             if (tail > 3) {
565                 *d = _mm256_loadu_ps(ptr);
566                 ptr += 8;
567                 tail -= 4;
568                 d = &_4567;
569             }
570             bool high = false;
571             if (tail > 1) {
572                 *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
573                 ptr += 4;
574                 tail -= 2;
575                 high = true;
576             }
577             if (tail > 0) {
578                 *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
579                           : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
580             }
581         } else {
582             _0123 = _mm256_loadu_ps(ptr + 0);
583             _4567 = _mm256_loadu_ps(ptr + 8);
584         }
585 
586         F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20),
587           _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31);
588 
589         *r = _mm256_shuffle_ps(_0145, _2367, 0x88);
590         *g = _mm256_shuffle_ps(_0145, _2367, 0xDD);
591     }
592     SI void store2(float* ptr, size_t tail, F r, F g) {
593         F _0145 = _mm256_unpacklo_ps(r, g),
594           _2367 = _mm256_unpackhi_ps(r, g);
595         F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20),
596           _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31);
597 
598         if (__builtin_expect(tail, 0)) {
599             const __m256* s = &_0123;
600             if (tail > 3) {
601                 _mm256_storeu_ps(ptr, *s);
602                 s = &_4567;
603                 tail -= 4;
604                 ptr += 8;
605             }
606             bool high = false;
607             if (tail > 1) {
608                 _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
609                 ptr += 4;
610                 tail -= 2;
611                 high = true;
612             }
613             if (tail > 0) {
614                 *(ptr + 0) = (*s)[ high ? 4 : 0];
615                 *(ptr + 1) = (*s)[ high ? 5 : 1];
616             }
617         } else {
618             _mm256_storeu_ps(ptr + 0, _0123);
619             _mm256_storeu_ps(ptr + 8, _4567);
620         }
621     }
622 
623     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
624         F _04, _15, _26, _37;
625         _04 = _15 = _26 = _37 = 0;
626         switch (tail) {
627             case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
628             case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
629             case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
630             case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
631             case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
632             case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
633             case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
634             case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
635         }
636 
637         F rg0145 = _mm256_unpacklo_ps(_04,_15),  // r0 r1 g0 g1 | r4 r5 g4 g5
638           ba0145 = _mm256_unpackhi_ps(_04,_15),
639           rg2367 = _mm256_unpacklo_ps(_26,_37),
640           ba2367 = _mm256_unpackhi_ps(_26,_37);
641 
642         *r = _mm256_unpacklo_pd(rg0145, rg2367);
643         *g = _mm256_unpackhi_pd(rg0145, rg2367);
644         *b = _mm256_unpacklo_pd(ba0145, ba2367);
645         *a = _mm256_unpackhi_pd(ba0145, ba2367);
646     }
647     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
648         F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
649           rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
650           ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
651           ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...
652 
653         F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
654           _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
655           _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
656           _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
657 
658         if (__builtin_expect(tail, 0)) {
659             if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
660             if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
661             if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
662             if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
663             if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
664             if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
665             if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
666         } else {
667             F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
668               _23 = _mm256_permute2f128_ps(_26, _37, 32),
669               _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
670               _67 = _mm256_permute2f128_ps(_26, _37, 49);
671             _mm256_storeu_ps(ptr+ 0, _01);
672             _mm256_storeu_ps(ptr+ 8, _23);
673             _mm256_storeu_ps(ptr+16, _45);
674             _mm256_storeu_ps(ptr+24, _67);
675         }
676     }
677 
678 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
679     template <typename T> using V = T __attribute__((ext_vector_type(4)));
680     using F   = V<float   >;
681     using I32 = V< int32_t>;
682     using U64 = V<uint64_t>;
683     using U32 = V<uint32_t>;
684     using U16 = V<uint16_t>;
685     using U8  = V<uint8_t >;
686 
687     SI F   mad(F f, F m, F a)  { return f*m+a;              }
688     SI F   min(F a, F b)       { return _mm_min_ps(a,b);    }
689     SI F   max(F a, F b)       { return _mm_max_ps(a,b);    }
690     SI F   abs_(F v)           { return _mm_and_ps(v, 0-v); }
691     SI F   rcp   (F v)         { return _mm_rcp_ps  (v);    }
692     SI F   rsqrt (F v)         { return _mm_rsqrt_ps(v);    }
693     SI F    sqrt_(F v)         { return _mm_sqrt_ps (v);    }
694     SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
695 
696     SI U16 pack(U32 v) {
697     #if defined(JUMPER_IS_SSE41)
698         auto p = _mm_packus_epi32(v,v);
699     #else
700         // Sign extend so that _mm_packs_epi32() does the pack we want.
701         auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
702         p = _mm_packs_epi32(p,p);
703     #endif
704         return sk_unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
705     }
706     SI U8 pack(U16 v) {
707         auto r = widen_cast<__m128i>(v);
708         r = _mm_packus_epi16(r,r);
709         return sk_unaligned_load<U8>(&r);
710     }
711 
712     SI F if_then_else(I32 c, F t, F e) {
713         return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
714     }
715 
716     SI F floor_(F v) {
717     #if defined(JUMPER_IS_SSE41)
718         return _mm_floor_ps(v);
719     #else
720         F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
721         return roundtrip - if_then_else(roundtrip > v, 1, 0);
722     #endif
723     }
724 
725     template <typename T>
726     SI V<T> gather(const T* p, U32 ix) {
727         return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
728     }
729 
730     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
731         __m128i _01;
732         if (__builtin_expect(tail,0)) {
733             _01 = _mm_setzero_si128();
734             if (tail > 1) {
735                 _01 = _mm_loadl_pd(_01, (const double*)ptr);            // r0 g0 r1 g1 00 00 00 00
736                 if (tail > 2) {
737                     _01 = _mm_loadh_pi(_01, (__m64 const* )(ptr + 4));  // r0 g0 r1 g1 r2 g2 00 00
738                 }
739             } else {
740                 _01 = _mm_loadl_pi(_01, (__m64 const*)ptr + 0);         // r0 g0 00 00 00 00 00 00
741             }
742         } else {
743             _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);  // r0 g0 r1 g1 r2 g2 r3 g3
744         }
745         auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8);      // r0 r1 g0 g1 r2 g2 r3 g3
746         auto rg      = _mm_shufflehi_epi16(rg01_23, 0xD8);  // r0 r1 g0 g1 r2 r3 g2 g3
747 
748         auto R = _mm_shuffle_epi32(rg, 0x88);  // r0 r1 r2 r3 r0 r1 r2 r3
749         auto G = _mm_shuffle_epi32(rg, 0xDD);  // g0 g1 g2 g3 g0 g1 g2 g3
750         *r = sk_unaligned_load<U16>(&R);
751         *g = sk_unaligned_load<U16>(&G);
752     }
753     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
754         U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
755         if (__builtin_expect(tail, 0)) {
756             if (tail > 1) {
757                 _mm_storel_epi64((__m128i*)ptr, rg);
758                 if (tail > 2) {
759                     int32_t rgpair = rg[2];
760                     memcpy(ptr + 4, &rgpair, sizeof(rgpair));
761                 }
762             } else {
763                 int32_t rgpair = rg[0];
764                 memcpy(ptr, &rgpair, sizeof(rgpair));
765             }
766         } else {
767             _mm_storeu_si128((__m128i*)ptr + 0, rg);
768         }
769     }
770 
771     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
772         __m128i _0, _1, _2, _3;
773         if (__builtin_expect(tail,0)) {
774             _1 = _2 = _3 = _mm_setzero_si128();
775             auto load_rgb = [](const uint16_t* src) {
776                 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
777                 return _mm_insert_epi16(v, src[2], 2);
778             };
779             if (  true  ) { _0 = load_rgb(ptr + 0); }
780             if (tail > 1) { _1 = load_rgb(ptr + 3); }
781             if (tail > 2) { _2 = load_rgb(ptr + 6); }
782         } else {
783             // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
784             auto _01 =                _mm_loadu_si128((const __m128i*)(ptr + 0))    ,
785                  _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
786 
787             // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
788             _0 = _01;
789             _1 = _mm_srli_si128(_01, 6);
790             _2 = _23;
791             _3 = _mm_srli_si128(_23, 6);
792         }
793 
794         // De-interlace to R,G,B.
795         auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
796              _13 = _mm_unpacklo_epi16(_1, _3);  // r1 r3 g1 g3 b1 b3 xx xx
797 
798         auto R = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
799              G = _mm_srli_si128(R, 8),
800              B = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 xx xx xx xx
801 
802         *r = sk_unaligned_load<U16>(&R);
803         *g = sk_unaligned_load<U16>(&G);
804         *b = sk_unaligned_load<U16>(&B);
805     }
806 
807     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
808         __m128i _01, _23;
809         if (__builtin_expect(tail,0)) {
810             _01 = _23 = _mm_setzero_si128();
811             auto src = (const double*)ptr;
812             if (  true  ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
813             if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
814             if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
815         } else {
816             _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
817             _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
818         }
819 
820         auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
821              _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3
822 
823         auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
824              ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
825 
826         *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
827         *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
828         *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
829         *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
830     }
831 
832     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
833         auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
834              ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
835 
836         if (__builtin_expect(tail, 0)) {
837             auto dst = (double*)ptr;
838             if (  true  ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
839             if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
840             if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
841         } else {
842             _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
843             _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
844         }
845     }
846 
847     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
848         F _01, _23;
849         if (__builtin_expect(tail, 0)) {
850             _01 = _23 = _mm_setzero_si128();
851             if (  true  ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); }
852             if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
853             if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
854         } else {
855             _01 = _mm_loadu_ps(ptr + 0);
856             _23 = _mm_loadu_ps(ptr + 4);
857         }
858         *r = _mm_shuffle_ps(_01, _23, 0x88);
859         *g = _mm_shuffle_ps(_01, _23, 0xDD);
860     }
861     SI void store2(float* ptr, size_t tail, F r, F g) {
862         F _01 = _mm_unpacklo_ps(r, g),
863           _23 = _mm_unpackhi_ps(r, g);
864         if (__builtin_expect(tail, 0)) {
865             if (  true  ) { _mm_storel_pi((__m64*)(ptr + 0), _01); }
866             if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
867             if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
868         } else {
869             _mm_storeu_ps(ptr + 0, _01);
870             _mm_storeu_ps(ptr + 4, _23);
871         }
872     }
873 
874     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
875         F _0, _1, _2, _3;
876         if (__builtin_expect(tail, 0)) {
877             _1 = _2 = _3 = _mm_setzero_si128();
878             if (  true  ) { _0 = _mm_loadu_ps(ptr + 0); }
879             if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
880             if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
881         } else {
882             _0 = _mm_loadu_ps(ptr + 0);
883             _1 = _mm_loadu_ps(ptr + 4);
884             _2 = _mm_loadu_ps(ptr + 8);
885             _3 = _mm_loadu_ps(ptr +12);
886         }
887         _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
888         *r = _0;
889         *g = _1;
890         *b = _2;
891         *a = _3;
892     }
893 
894     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
895         _MM_TRANSPOSE4_PS(r,g,b,a);
896         if (__builtin_expect(tail, 0)) {
897             if (  true  ) { _mm_storeu_ps(ptr + 0, r); }
898             if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
899             if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
900         } else {
901             _mm_storeu_ps(ptr + 0, r);
902             _mm_storeu_ps(ptr + 4, g);
903             _mm_storeu_ps(ptr + 8, b);
904             _mm_storeu_ps(ptr +12, a);
905         }
906     }
907 #endif
908 
909 // We need to be careful with casts.
910 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
911 // These named casts and bit_cast() are always what they seem to be.
912 #if defined(JUMPER_IS_SCALAR)
913     SI F   cast  (U32 v) { return   (F)v; }
914     SI F   cast64(U64 v) { return   (F)v; }
915     SI U32 trunc_(F   v) { return (U32)v; }
916     SI U32 expand(U16 v) { return (U32)v; }
917     SI U32 expand(U8  v) { return (U32)v; }
918 #else
919     SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
920     SI F   cast64(U64 v) { return      __builtin_convertvector(     v,   F); }
921     SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
922     SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
923     SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
924 #endif
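// For example, cast() converts values, so a lane holding 1 becomes 1.0f; bit_cast<F>() reinterprets
// bits, so a lane holding 0x3f800000 becomes 1.0f only because that happens to be 1.0f's bit pattern.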
925 
926 template <typename V>
927 SI V if_then_else(I32 c, V t, V e) {
928     return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
929 }
930 
931 SI U16 bswap(U16 x) {
932 #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
933     // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
934     // when generating code for SSE2 and SSE4.1.  We'll do it manually...
935     auto v = widen_cast<__m128i>(x);
936     v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
937     return sk_unaligned_load<U16>(&v);
938 #else
939     return (x<<8) | (x>>8);
940 #endif
941 }
942 
943 SI F fract(F v) { return v - floor_(v); }
944 
945 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
946 SI F approx_log2(F x) {
947     // e - 127 is a fair approximation of log2(x) in its own right...
948     F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));
949 
950     // ... but using the mantissa to refine its error is _much_ better.
951     F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
952     return e
953          - 124.225514990f
954          -   1.498030302f * m
955          -   1.725879990f / (0.3520887068f + m);
956 }
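// Worked example: for x = 2.0f the bits are 0x40000000, so e = 128 and m = 0.5, and the result is
// roughly 128 - 124.2255 - 0.7490 - 2.0255 = 1.0000, matching log2(2) = 1.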
957 SI F approx_pow2(F x) {
958     F f = fract(x);
959     return bit_cast<F>(round(1.0f * (1<<23),
960                              x + 121.274057500f
961                                -   1.490129070f * f
962                                +  27.728023300f / (4.84252568f - f)));
963 }
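// Worked example: approx_pow2(1.0f) has f = 0, so it rounds about 2^23 * (1 + 121.2741 + 27.7280/4.8425)
// = 2^23 * 128 = 0x40000000, and that bit pattern is exactly 2.0f.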
964 
965 SI F approx_powf(F x, F y) {
966 #if defined(SK_LEGACY_APPROX_POWF_SPECIALCASE)
967     return if_then_else((x == 0)         , 0
968 #else
969     return if_then_else((x == 0)|(x == 1), x
970 #endif
971                                          , approx_pow2(approx_log2(x) * y));
972 }
973 
974 SI F from_half(U16 h) {
975 #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
976     && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
977     return vcvt_f32_f16(h);
978 
979 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
980     return _mm256_cvtph_ps(h);
981 
982 #else
983     // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
984     U32 sem = expand(h),
985         s   = sem & 0x8000,
986          em = sem ^ s;
987 
988     // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
989     auto denorm = (I32)em < 0x0400;      // I32 comparison is often quicker, and always safe here.
990     return if_then_else(denorm, F(0)
991                               , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
992 #endif
993 }
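// Worked example for the bit-twiddling path: the half 0x3c00 (1.0) has s = 0 and em = 0x3c00, which
// is not denormal, so we return bit_cast<F>((0x3c00<<13) + (112<<23)) = bit_cast<F>(0x3f800000) = 1.0f.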
994 
995 SI U16 to_half(F f) {
996 #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
997     && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
998     return vcvt_f16_f32(f);
999 
1000 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
1001     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1002 
1003 #else
1004     // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1005     U32 sem = bit_cast<U32>(f),
1006         s   = sem & 0x80000000,
1007          em = sem ^ s;
1008 
1009     // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1010     auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
1011     return pack(if_then_else(denorm, U32(0)
1012                                    , (s>>16) + (em>>13) - ((127-15)<<10)));
1013 #endif
1014 }
1015 
1016 // Our fundamental vector depth is our pixel stride.
1017 static const size_t N = sizeof(F) / sizeof(float);
1018 
1019 // We're finally going to get to what a Stage function looks like!
1020 //    tail == 0 ~~> work on a full N pixels
1021 //    tail != 0 ~~> work on only the first tail pixels
1022 // tail is always < N.
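// For example, under JUMPER_IS_HSW the vectors hold 8 floats, so N == 8 and a 21-pixel row runs as
// two full calls of 8 pixels followed by one call with tail == 5.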
1023 
1024 // Any custom ABI to use for all (non-externally-facing) stage functions?
1025 // Also decide here whether to use narrow (compromise) or wide (ideal) stages.
1026 #if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1027     // This lets us pass vectors more efficiently on 32-bit ARM.
1028     // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
1029     #define ABI __attribute__((pcs("aapcs-vfp")))
1030     #define JUMPER_NARROW_STAGES 1
1031 #elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
1032     // SysV ABI makes it very sensible to use wide stages with clang-cl.
1033     // TODO: crashes during compilation  :(
1034     #define ABI __attribute__((sysv_abi))
1035     #define JUMPER_NARROW_STAGES 0
1036 #elif defined(_MSC_VER)
1037     // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
1038     // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
1039     #define ABI __vectorcall
1040     #define JUMPER_NARROW_STAGES 1
1041 #elif defined(__x86_64__) || defined(SK_CPU_ARM64)
1042     // These platforms are ideal for wider stages, and their default ABI is ideal.
1043     #define ABI
1044     #define JUMPER_NARROW_STAGES 0
1045 #else
1046     // 32-bit or unknown... shunt them down the narrow path.
1047     // Odds are these have few registers and are better off there.
1048     #define ABI
1049     #define JUMPER_NARROW_STAGES 1
1050 #endif
1051 
1052 #if JUMPER_NARROW_STAGES
1053     struct Params {
1054         size_t dx, dy, tail;
1055         F dr,dg,db,da;
1056     };
1057     using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
1058 #else
1059     // We keep program the second argument, so that it's passed in rsi for load_and_inc().
1060     using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
1061 #endif
1062 
1063 
1064 static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) {
1065     auto start = (Stage)load_and_inc(program);
1066     const size_t x0 = dx;
1067     for (; dy < ylimit; dy++) {
1068     #if JUMPER_NARROW_STAGES
1069         Params params = { x0,dy,0, 0,0,0,0 };
1070         while (params.dx + N <= xlimit) {
1071             start(&params,program, 0,0,0,0);
1072             params.dx += N;
1073         }
1074         if (size_t tail = xlimit - params.dx) {
1075             params.tail = tail;
1076             start(&params,program, 0,0,0,0);
1077         }
1078     #else
1079         dx = x0;
1080         while (dx + N <= xlimit) {
1081             start(0,program,dx,dy,    0,0,0,0, 0,0,0,0);
1082             dx += N;
1083         }
1084         if (size_t tail = xlimit - dx) {
1085             start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
1086         }
1087     #endif
1088     }
1089 }
1090 
1091 #if JUMPER_NARROW_STAGES
1092     #define STAGE(name, ...)                                                    \
1093         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
1094                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);   \
1095         static void ABI name(Params* params, void** program,                    \
1096                              F r, F g, F b, F a) {                              \
1097             name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,  \
1098                      params->dr, params->dg, params->db, params->da);           \
1099             auto next = (Stage)load_and_inc(program);                           \
1100             next(params,program, r,g,b,a);                                      \
1101         }                                                                       \
1102         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
1103                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1104 #else
1105     #define STAGE(name, ...)                                                         \
1106         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
1107                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);        \
1108         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,      \
1109                              F r, F g, F b, F a, F dr, F dg, F db, F da) {           \
1110             name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                 \
1111             auto next = (Stage)load_and_inc(program);                                \
1112             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                          \
1113         }                                                                            \
1114         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
1115                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1116 #endif
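// Usage sketch (hypothetical, not a stage defined in this file): a stage that scales red by a
// uniform factor could be written as
//   STAGE(scale_red, const float* k) { r = r * *k; }
// with the macro supplying the wrapper that unpacks the context and tail-calls the next stage.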
1117 
1118 
1119 // just_return() is a simple no-op stage that only exists to end the chain,
1120 // returning back up to start_pipeline(), and from there to the caller.
1121 #if JUMPER_NARROW_STAGES
1122     static void ABI just_return(Params*, void**, F,F,F,F) {}
1123 #else
1124     static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
1125 #endif
1126 
1127 
1128 // We could start defining normal Stages now.  But first, some helper functions.
1129 
1130 // These load() and store() methods are tail-aware,
1131 // but focus mainly on keeping the at-stride tail==0 case fast.
1132 
1133 template <typename V, typename T>
1134 SI V load(const T* src, size_t tail) {
1135 #if !defined(JUMPER_IS_SCALAR)
1136     __builtin_assume(tail < N);
1137     if (__builtin_expect(tail, 0)) {
1138         V v{};  // Any inactive lanes are zeroed.
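        // The cases below fall through on purpose: e.g. tail == 7 fills lanes 6, 5, and 4 one at a
        // time, then case 4 copies the remaining low four lanes with a single memcpy.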
1139         switch (tail) {
1140             case 7: v[6] = src[6];
1141             case 6: v[5] = src[5];
1142             case 5: v[4] = src[4];
1143             case 4: memcpy(&v, src, 4*sizeof(T)); break;
1144             case 3: v[2] = src[2];
1145             case 2: memcpy(&v, src, 2*sizeof(T)); break;
1146             case 1: memcpy(&v, src, 1*sizeof(T)); break;
1147         }
1148         return v;
1149     }
1150 #endif
1151     return sk_unaligned_load<V>(src);
1152 }
1153 
1154 template <typename V, typename T>
1155 SI void store(T* dst, V v, size_t tail) {
1156 #if !defined(JUMPER_IS_SCALAR)
1157     __builtin_assume(tail < N);
1158     if (__builtin_expect(tail, 0)) {
1159         switch (tail) {
1160             case 7: dst[6] = v[6];
1161             case 6: dst[5] = v[5];
1162             case 5: dst[4] = v[4];
1163             case 4: memcpy(dst, &v, 4*sizeof(T)); break;
1164             case 3: dst[2] = v[2];
1165             case 2: memcpy(dst, &v, 2*sizeof(T)); break;
1166             case 1: memcpy(dst, &v, 1*sizeof(T)); break;
1167         }
1168         return;
1169     }
1170 #endif
1171     sk_unaligned_store(dst, v);
1172 }
1173 
1174 SI F from_byte(U8 b) {
1175     return cast(expand(b)) * (1/255.0f);
1176 }
1177 SI F from_short(U16 s) {
1178     return cast(expand(s)) * (1/65535.0f);
1179 }
1180 SI void from_565(U16 _565, F* r, F* g, F* b) {
1181     U32 wide = expand(_565);
1182     *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
1183     *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1184     *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1185 }
1186 SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1187     U32 wide = expand(_4444);
1188     *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
1189     *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1190     *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1191     *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1192 }
1193 SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1194     *r = cast((_8888      ) & 0xff) * (1/255.0f);
1195     *g = cast((_8888 >>  8) & 0xff) * (1/255.0f);
1196     *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
1197     *a = cast((_8888 >> 24)       ) * (1/255.0f);
1198 }
1199 SI void from_88(U16 _88, F* r, F* g) {
1200     U32 wide = expand(_88);
1201     *r = cast((wide      ) & 0xff) * (1/255.0f);
1202     *g = cast((wide >>  8) & 0xff) * (1/255.0f);
1203 }
1204 SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1205     *r = cast((rgba      ) & 0x3ff) * (1/1023.0f);
1206     *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f);
1207     *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f);
1208     *a = cast((rgba >> 30)        ) * (1/   3.0f);
1209 }
1210 SI void from_1616(U32 _1616, F* r, F* g) {
1211     *r = cast((_1616      ) & 0xffff) * (1/65535.0f);
1212     *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1213 }
1214 SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1215     *r = cast64((_16161616      ) & 0xffff) * (1/65535.0f);
1216     *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1217     *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1218     *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1219 }
1220 
1221 // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1222 template <typename T>
1223 SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1224     return (T*)ctx->pixels + dy*ctx->stride + dx;
1225 }
1226 
1227 // clamp v to [0,limit).
1228 SI F clamp(F v, F limit) {
1229     F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
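    // (For positive finite floats, subtracting 1 from the bit pattern yields the next representable
    // value below, i.e. the largest float strictly less than limit.)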
1230     return min(max(0, v), inclusive);
1231 }
1232 
1233 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1234 template <typename T>
1235 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1236     x = clamp(x, ctx->width);
1237     y = clamp(y, ctx->height);
1238 
1239     *ptr = (const T*)ctx->pixels;
1240     return trunc_(y)*ctx->stride + trunc_(x);
1241 }
1242 
1243 // We often have a nominally [0,1] float value we need to scale and convert to an integer,
1244 // whether for a table lookup or to pack back down into bytes for storage.
1245 //
1246 // In practice, especially when dealing with interesting color spaces, that notionally
1247 // [0,1] float may be out of [0,1] range.  Unorms cannot represent that, so we must clamp.
1248 //
1249 // You can adjust the expected input to [0,bias] by tweaking that parameter.
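// For example, to_unorm(v, 255.0f) clamps v to [0,1] and rounds it to an integer in [0,255],
// ready to pack back down into a byte.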
1250 SI U32 to_unorm(F v, F scale, F bias = 1.0f) {
1251     // TODO: platform-specific implementations to to_unorm(), removing round() entirely?
1252     // Any time we use round() we probably want to use to_unorm().
1253     return round(min(max(0, v), bias), scale);
1254 }
1255 
1256 SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); }
1257 
1258 // Now finally, normal Stages!
1259 
1260 STAGE(seed_shader, Ctx::None) {
1261     static const float iota[] = {
1262         0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
1263         8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
1264     };
1265     // It's important for speed to explicitly cast(dx) and cast(dy),
1266     // which has the effect of splatting them to vectors before converting to floats.
1267     // On Intel this breaks a data dependency on previous loop iterations' registers.
1268     r = cast(dx) + sk_unaligned_load<F>(iota);
1269     g = cast(dy) + 0.5f;
1270     b = 1.0f;
1271     a = 0;
1272     dr = dg = db = da = 0;
1273 }
1274 
STAGE(dither,const float * rate)1275 STAGE(dither, const float* rate) {
1276     // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
1277     uint32_t iota[] = {0,1,2,3,4,5,6,7};
1278     U32 X = dx + sk_unaligned_load<U32>(iota),
1279         Y = dy;
1280 
1281     // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
1282     // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
1283 
1284     // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
1285     Y ^= X;
1286 
1287     // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
1288     // for 2^6 == 64 == 8x8 matrix values.  If X=abc and Y=def, we make fcebda.
1289     U32 M = (Y & 1) << 5 | (X & 1) << 4
1290           | (Y & 2) << 2 | (X & 2) << 1
1291           | (Y & 4) >> 1 | (X & 4) >> 2;
1292 
1293     // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
1294     // We want to make sure our dither is less than 0.5 in either direction to keep exact values
1295     // like 0 and 1 unchanged after rounding.
1296     F dither = cast(M) * (2/128.0f) - (63/128.0f);
1297 
1298     r += *rate*dither;
1299     g += *rate*dither;
1300     b += *rate*dither;
1301 
1302     r = max(0, min(r, a));
1303     g = max(0, min(g, a));
1304     b = max(0, min(b, a));
1305 }
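// A worked example of the bit mixing above, for a single lane: with X = 3 (bits 011) and an
// original Y of 5, Y ^= X gives 6 (bits 110), so
//     M = (0)<<5 | (1)<<4 | (2)<<2 | (2)<<1 | (4)>>1 | (0)>>2 = 16+8+4+2 = 30,
// and dither = 30*(2/128) - 63/128 = -3/128 ~= -0.023, comfortably inside (-0.5,+0.5).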
1306 
1307 // load 4 floats from memory, and splat them into r,g,b,a
STAGE(uniform_color,const SkRasterPipeline_UniformColorCtx * c)1308 STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1309     r = c->r;
1310     g = c->g;
1311     b = c->b;
1312     a = c->a;
1313 }
STAGE(unbounded_uniform_color,const SkRasterPipeline_UniformColorCtx * c)1314 STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1315     r = c->r;
1316     g = c->g;
1317     b = c->b;
1318     a = c->a;
1319 }
1320 // load 4 floats from memory, and splat them into dr,dg,db,da
STAGE(uniform_color_dst,const SkRasterPipeline_UniformColorCtx * c)1321 STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
1322     dr = c->r;
1323     dg = c->g;
1324     db = c->b;
1325     da = c->a;
1326 }
1327 
1328 // splats opaque-black into r,g,b,a
STAGE(black_color,Ctx::None)1329 STAGE(black_color, Ctx::None) {
1330     r = g = b = 0.0f;
1331     a = 1.0f;
1332 }
1333 
STAGE(white_color,Ctx::None)1334 STAGE(white_color, Ctx::None) {
1335     r = g = b = a = 1.0f;
1336 }
1337 
1338 // load registers r,g,b,a from context (mirrors store_rgba)
STAGE(load_src,const float * ptr)1339 STAGE(load_src, const float* ptr) {
1340     r = sk_unaligned_load<F>(ptr + 0*N);
1341     g = sk_unaligned_load<F>(ptr + 1*N);
1342     b = sk_unaligned_load<F>(ptr + 2*N);
1343     a = sk_unaligned_load<F>(ptr + 3*N);
1344 }
1345 
1346 // store registers r,g,b,a into context (mirrors load_rgba)
STAGE(store_src,float * ptr)1347 STAGE(store_src, float* ptr) {
1348     sk_unaligned_store(ptr + 0*N, r);
1349     sk_unaligned_store(ptr + 1*N, g);
1350     sk_unaligned_store(ptr + 2*N, b);
1351     sk_unaligned_store(ptr + 3*N, a);
1352 }
1353 
1354 // load registers dr,dg,db,da from context (mirrors store_dst)
STAGE(load_dst,const float * ptr)1355 STAGE(load_dst, const float* ptr) {
1356     dr = sk_unaligned_load<F>(ptr + 0*N);
1357     dg = sk_unaligned_load<F>(ptr + 1*N);
1358     db = sk_unaligned_load<F>(ptr + 2*N);
1359     da = sk_unaligned_load<F>(ptr + 3*N);
1360 }
1361 
1362 // store registers dr,dg,db,da into context (mirrors load_dst)
STAGE(store_dst,float * ptr)1363 STAGE(store_dst, float* ptr) {
1364     sk_unaligned_store(ptr + 0*N, dr);
1365     sk_unaligned_store(ptr + 1*N, dg);
1366     sk_unaligned_store(ptr + 2*N, db);
1367     sk_unaligned_store(ptr + 3*N, da);
1368 }
1369 
1370 // Most blend modes apply the same logic to each channel.
1371 #define BLEND_MODE(name)                       \
1372     SI F name##_channel(F s, F d, F sa, F da); \
1373     STAGE(name, Ctx::None) {                   \
1374         r = name##_channel(r,dr,a,da);         \
1375         g = name##_channel(g,dg,a,da);         \
1376         b = name##_channel(b,db,a,da);         \
1377         a = name##_channel(a,da,a,da);         \
1378     }                                          \
1379     SI F name##_channel(F s, F d, F sa, F da)
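// To illustrate (not part of the original source), BLEND_MODE(srcover) below expands to roughly:
//
//     SI F srcover_channel(F s, F d, F sa, F da);
//     STAGE(srcover, Ctx::None) {
//         r = srcover_channel(r,dr,a,da);
//         g = srcover_channel(g,dg,a,da);
//         b = srcover_channel(b,db,a,da);
//         a = srcover_channel(a,da,a,da);
//     }
//     SI F srcover_channel(F s, F d, F sa, F da) { return mad(d, inv(sa), s); }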
1380 
inv(F x)1381 SI F inv(F x) { return 1.0f - x; }
two(F x)1382 SI F two(F x) { return x + x; }
1383 
1384 
BLEND_MODE(clear)1385 BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)1386 BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)1387 BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)1388 BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)1389 BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)1390 BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)1391 BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)1392 BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)1393 BLEND_MODE(dstover)  { return mad(s, inv(da), d); }
1394 
BLEND_MODE(modulate)1395 BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply)1396 BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_)1397 BLEND_MODE(plus_)    { return min(s + d, 1.0f); }  // We can clamp to either 1 or sa.
BLEND_MODE(screen)1398 BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)1399 BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }
1400 #undef BLEND_MODE
1401 
1402 // Most other blend modes apply the same logic to colors, and srcover to alpha.
1403 #define BLEND_MODE(name)                       \
1404     SI F name##_channel(F s, F d, F sa, F da); \
1405     STAGE(name, Ctx::None) {                   \
1406         r = name##_channel(r,dr,a,da);         \
1407         g = name##_channel(g,dg,a,da);         \
1408         b = name##_channel(b,db,a,da);         \
1409         a = mad(da, inv(a), a);                \
1410     }                                          \
1411     SI F name##_channel(F s, F d, F sa, F da)
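// Note that a = mad(da, inv(a), a) is the usual srcover alpha, a + da*(1-a);
// e.g. with a = 0.5 and da = 0.5 the blended alpha is 0.75.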
1412 
BLEND_MODE(darken)1413 BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa) ; }
BLEND_MODE(lighten)1414 BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa) ; }
BLEND_MODE(difference)1415 BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)1416 BLEND_MODE(exclusion)  { return s + d - two(s*d); }
1417 
BLEND_MODE(colorburn)1418 BLEND_MODE(colorburn) {
1419     return if_then_else(d == da,    d +    s*inv(da),
1420            if_then_else(s ==  0, /* s + */ d*inv(sa),
1421                                  sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa)));
1422 }
BLEND_MODE(colordodge)1423 BLEND_MODE(colordodge) {
1424     return if_then_else(d ==  0, /* d + */ s*inv(da),
1425            if_then_else(s == sa,    s +    d*inv(sa),
1426                                  sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa)));
1427 }
BLEND_MODE(hardlight)1428 BLEND_MODE(hardlight) {
1429     return s*inv(da) + d*inv(sa)
1430          + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
1431 }
BLEND_MODE(overlay)1432 BLEND_MODE(overlay) {
1433     return s*inv(da) + d*inv(sa)
1434          + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
1435 }
1436 
BLEND_MODE(softlight)1437 BLEND_MODE(softlight) {
1438     F m  = if_then_else(da > 0, d / da, 0),
1439       s2 = two(s),
1440       m4 = two(two(m));
1441 
1442     // The logic forks three ways:
1443     //    1. dark src?
1444     //    2. light src, dark dst?
1445     //    3. light src, light dst?
1446     F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
1447       darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
1448       liteDst = rcp(rsqrt(m)) - m,                 // Used in case 3.
1449       liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
1450     return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);      // 1 or (2 or 3)?
1451 }
1452 #undef BLEND_MODE
1453 
1454 // We're basing our implementation of non-separable blend modes on
1455 //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1456 // and
1457 //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1458 // They're equivalent, but ES' math has been better simplified.
1459 //
1460 // Anything extra we add beyond that is to make the math work with premul inputs.
1461 
max(F r,F g,F b)1462 SI F max(F r, F g, F b) { return max(r, max(g, b)); }
min(F r,F g,F b)1463 SI F min(F r, F g, F b) { return min(r, min(g, b)); }
1464 
sat(F r,F g,F b)1465 SI F sat(F r, F g, F b) { return max(r,g,b) - min(r,g,b); }
lum(F r,F g,F b)1466 SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; }
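// These luma weights sum to exactly 1.00 (0.30 + 0.59 + 0.11), so lum(1,1,1) == 1.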
1467 
set_sat(F * r,F * g,F * b,F s)1468 SI void set_sat(F* r, F* g, F* b, F s) {
1469     F mn  = min(*r,*g,*b),
1470       mx  = max(*r,*g,*b),
1471       sat = mx - mn;
1472 
1473     // Map min channel to 0, max channel to s, and scale the middle proportionally.
1474     auto scale = [=](F c) {
1475         return if_then_else(sat == 0, 0, (c - mn) * s / sat);
1476     };
1477     *r = scale(*r);
1478     *g = scale(*g);
1479     *b = scale(*b);
1480 }
set_lum(F * r,F * g,F * b,F l)1481 SI void set_lum(F* r, F* g, F* b, F l) {
1482     F diff = l - lum(*r, *g, *b);
1483     *r += diff;
1484     *g += diff;
1485     *b += diff;
1486 }
clip_color(F * r,F * g,F * b,F a)1487 SI void clip_color(F* r, F* g, F* b, F a) {
1488     F mn = min(*r, *g, *b),
1489       mx = max(*r, *g, *b),
1490       l  = lum(*r, *g, *b);
1491 
1492     auto clip = [=](F c) {
1493         c = if_then_else(mn >= 0, c, l + (c - l) * (    l) / (l - mn)   );
1494         c = if_then_else(mx >  a,    l + (c - l) * (a - l) / (mx - l), c);
1495         c = max(c, 0);  // Sometimes without this we may dip just a little negative.
1496         return c;
1497     };
1498     *r = clip(*r);
1499     *g = clip(*g);
1500     *b = clip(*b);
1501 }
1502 
STAGE(hue,Ctx::None)1503 STAGE(hue, Ctx::None) {
1504     F R = r*a,
1505       G = g*a,
1506       B = b*a;
1507 
1508     set_sat(&R, &G, &B, sat(dr,dg,db)*a);
1509     set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1510     clip_color(&R,&G,&B, a*da);
1511 
1512     r = r*inv(da) + dr*inv(a) + R;
1513     g = g*inv(da) + dg*inv(a) + G;
1514     b = b*inv(da) + db*inv(a) + B;
1515     a = a + da - a*da;
1516 }
STAGE(saturation,Ctx::None)1517 STAGE(saturation, Ctx::None) {
1518     F R = dr*a,
1519       G = dg*a,
1520       B = db*a;
1521 
1522     set_sat(&R, &G, &B, sat( r, g, b)*da);
1523     set_lum(&R, &G, &B, lum(dr,dg,db)* a);  // (This is not redundant.)
1524     clip_color(&R,&G,&B, a*da);
1525 
1526     r = r*inv(da) + dr*inv(a) + R;
1527     g = g*inv(da) + dg*inv(a) + G;
1528     b = b*inv(da) + db*inv(a) + B;
1529     a = a + da - a*da;
1530 }
STAGE(color,Ctx::None)1531 STAGE(color, Ctx::None) {
1532     F R = r*da,
1533       G = g*da,
1534       B = b*da;
1535 
1536     set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1537     clip_color(&R,&G,&B, a*da);
1538 
1539     r = r*inv(da) + dr*inv(a) + R;
1540     g = g*inv(da) + dg*inv(a) + G;
1541     b = b*inv(da) + db*inv(a) + B;
1542     a = a + da - a*da;
1543 }
STAGE(luminosity,Ctx::None)1544 STAGE(luminosity, Ctx::None) {
1545     F R = dr*a,
1546       G = dg*a,
1547       B = db*a;
1548 
1549     set_lum(&R, &G, &B, lum(r,g,b)*da);
1550     clip_color(&R,&G,&B, a*da);
1551 
1552     r = r*inv(da) + dr*inv(a) + R;
1553     g = g*inv(da) + dg*inv(a) + G;
1554     b = b*inv(da) + db*inv(a) + B;
1555     a = a + da - a*da;
1556 }
1557 
STAGE(srcover_rgba_8888,const SkRasterPipeline_MemoryCtx * ctx)1558 STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1559     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1560 
1561     U32 dst = load<U32>(ptr, tail);
1562     dr = cast((dst      ) & 0xff);
1563     dg = cast((dst >>  8) & 0xff);
1564     db = cast((dst >> 16) & 0xff);
1565     da = cast((dst >> 24)       );
1566     // {dr,dg,db,da} are in [0,255]
1567     // { r, g, b, a} are in [0,  1] (but may be out of gamut)
1568 
1569     r = mad(dr, inv(a), r*255.0f);
1570     g = mad(dg, inv(a), g*255.0f);
1571     b = mad(db, inv(a), b*255.0f);
1572     a = mad(da, inv(a), a*255.0f);
1573     // { r, g, b, a} are now in [0,255]  (but may be out of gamut)
1574 
1575     // to_unorm() clamps back to gamut.  Scaling by 1 since we're already 255-biased.
1576     dst = to_unorm(r, 1, 255)
1577         | to_unorm(g, 1, 255) <<  8
1578         | to_unorm(b, 1, 255) << 16
1579         | to_unorm(a, 1, 255) << 24;
1580     store(ptr, dst, tail);
1581 }
1582 
STAGE(clamp_0,Ctx::None)1583 STAGE(clamp_0, Ctx::None) {
1584     r = max(r, 0);
1585     g = max(g, 0);
1586     b = max(b, 0);
1587     a = max(a, 0);
1588 }
1589 
STAGE(clamp_1,Ctx::None)1590 STAGE(clamp_1, Ctx::None) {
1591     r = min(r, 1.0f);
1592     g = min(g, 1.0f);
1593     b = min(b, 1.0f);
1594     a = min(a, 1.0f);
1595 }
1596 
STAGE(clamp_a,Ctx::None)1597 STAGE(clamp_a, Ctx::None) {
1598     a = min(a, 1.0f);
1599     r = min(r, a);
1600     g = min(g, a);
1601     b = min(b, a);
1602 }
1603 
STAGE(clamp_gamut,Ctx::None)1604 STAGE(clamp_gamut, Ctx::None) {
1605     // If you're using this stage, a should already be in [0,1].
1606     r = min(max(r, 0), a);
1607     g = min(max(g, 0), a);
1608     b = min(max(b, 0), a);
1609 }
1610 
STAGE(set_rgb,const float * rgb)1611 STAGE(set_rgb, const float* rgb) {
1612     r = rgb[0];
1613     g = rgb[1];
1614     b = rgb[2];
1615 }
STAGE(unbounded_set_rgb,const float * rgb)1616 STAGE(unbounded_set_rgb, const float* rgb) {
1617     r = rgb[0];
1618     g = rgb[1];
1619     b = rgb[2];
1620 }
1621 
STAGE(swap_rb,Ctx::None)1622 STAGE(swap_rb, Ctx::None) {
1623     auto tmp = r;
1624     r = b;
1625     b = tmp;
1626 }
STAGE(swap_rb_dst,Ctx::None)1627 STAGE(swap_rb_dst, Ctx::None) {
1628     auto tmp = dr;
1629     dr = db;
1630     db = tmp;
1631 }
1632 
STAGE(move_src_dst,Ctx::None)1633 STAGE(move_src_dst, Ctx::None) {
1634     dr = r;
1635     dg = g;
1636     db = b;
1637     da = a;
1638 }
STAGE(move_dst_src,Ctx::None)1639 STAGE(move_dst_src, Ctx::None) {
1640     r = dr;
1641     g = dg;
1642     b = db;
1643     a = da;
1644 }
1645 
STAGE(premul,Ctx::None)1646 STAGE(premul, Ctx::None) {
1647     r = r * a;
1648     g = g * a;
1649     b = b * a;
1650 }
STAGE(premul_dst,Ctx::None)1651 STAGE(premul_dst, Ctx::None) {
1652     dr = dr * da;
1653     dg = dg * da;
1654     db = db * da;
1655 }
STAGE(unpremul,Ctx::None)1656 STAGE(unpremul, Ctx::None) {
1657     float inf = bit_cast<float>(0x7f800000);
1658     auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0);
1659     r *= scale;
1660     g *= scale;
1661     b *= scale;
1662 }
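// The 1.0f/a < inf test guards against a == 0 (and values so small that 1/a overflows):
// in those cases scale becomes 0, so r,g,b are zeroed instead of turning into inf or NaN.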
1663 
STAGE(force_opaque,Ctx::None)1664 STAGE(force_opaque    , Ctx::None) {  a = 1; }
STAGE(force_opaque_dst,Ctx::None)1665 STAGE(force_opaque_dst, Ctx::None) { da = 1; }
1666 
STAGE(rgb_to_hsl,Ctx::None)1667 STAGE(rgb_to_hsl, Ctx::None) {
1668     F mx = max(r,g,b),
1669       mn = min(r,g,b),
1670       d = mx - mn,
1671       d_rcp = 1.0f / d;
1672 
1673     F h = (1/6.0f) *
1674           if_then_else(mx == mn, 0,
1675           if_then_else(mx ==  r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0),
1676           if_then_else(mx ==  g, (b-r)*d_rcp + 2.0f,
1677                                  (r-g)*d_rcp + 4.0f)));
1678 
1679     F l = (mx + mn) * 0.5f;
1680     F s = if_then_else(mx == mn, 0,
1681                        d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));
1682 
1683     r = h;
1684     g = s;
1685     b = l;
1686 }
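// A quick worked example: pure red (r,g,b) = (1,0,0) gives mx = 1, mn = 0, d = 1, so
// h = (1/6)*((0-0)*1 + 0) = 0, l = 0.5, and s = 1/(mx+mn) = 1, i.e. (h,s,l) = (0, 1, 0.5).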
STAGE(hsl_to_rgb,Ctx::None)1687 STAGE(hsl_to_rgb, Ctx::None) {
1688     F h = r,
1689       s = g,
1690       l = b;
1691 
1692     F q = l + if_then_else(l >= 0.5f, s - l*s, l*s),
1693       p = 2.0f*l - q;
1694 
1695     auto hue_to_rgb = [&](F t) {
1696         t = fract(t);
1697 
1698         F r = p;
1699         r = if_then_else(t >= 4/6.0f, r, p + (q-p)*(4.0f - 6.0f*t));
1700         r = if_then_else(t >= 3/6.0f, r, q);
1701         r = if_then_else(t >= 1/6.0f, r, p + (q-p)*(       6.0f*t));
1702         return r;
1703     };
1704 
1705     r = if_then_else(s == 0, l, hue_to_rgb(h + (1/3.0f)));
1706     g = if_then_else(s == 0, l, hue_to_rgb(h           ));
1707     b = if_then_else(s == 0, l, hue_to_rgb(h - (1/3.0f)));
1708 }
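// Round-tripping the example above: (h,s,l) = (0, 1, 0.5) gives q = 1, p = 0, and
// hue_to_rgb(h + 1/3), hue_to_rgb(h), hue_to_rgb(h - 1/3) evaluate to 1, 0, 0 -- pure red again.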
1709 
1710 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
alpha_coverage_from_rgb_coverage(F a,F da,F cr,F cg,F cb)1711 SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
1712     return if_then_else(a < da, min(cr,cg,cb)
1713                               , max(cr,cg,cb));
1714 }
1715 
STAGE(scale_1_float,const float * c)1716 STAGE(scale_1_float, const float* c) {
1717     r = r * *c;
1718     g = g * *c;
1719     b = b * *c;
1720     a = a * *c;
1721 }
STAGE(scale_u8,const SkRasterPipeline_MemoryCtx * ctx)1722 STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1723     auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1724 
1725     auto scales = load<U8>(ptr, tail);
1726     auto c = from_byte(scales);
1727 
1728     r = r * c;
1729     g = g * c;
1730     b = b * c;
1731     a = a * c;
1732 }
STAGE(scale_565,const SkRasterPipeline_MemoryCtx * ctx)1733 STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
1734     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1735 
1736     F cr,cg,cb;
1737     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1738 
1739     F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1740 
1741     r = r * cr;
1742     g = g * cg;
1743     b = b * cb;
1744     a = a * ca;
1745 }
1746 
lerp(F from,F to,F t)1747 SI F lerp(F from, F to, F t) {
1748     return mad(to-from, t, from);
1749 }
1750 
STAGE(lerp_1_float,const float * c)1751 STAGE(lerp_1_float, const float* c) {
1752     r = lerp(dr, r, *c);
1753     g = lerp(dg, g, *c);
1754     b = lerp(db, b, *c);
1755     a = lerp(da, a, *c);
1756 }
STAGE(lerp_native,const float scales[])1757 STAGE(lerp_native, const float scales[]) {
1758     auto c = sk_unaligned_load<F>(scales);
1759     r = lerp(dr, r, c);
1760     g = lerp(dg, g, c);
1761     b = lerp(db, b, c);
1762     a = lerp(da, a, c);
1763 }
STAGE(lerp_u8,const SkRasterPipeline_MemoryCtx * ctx)1764 STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1765     auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1766 
1767     auto scales = load<U8>(ptr, tail);
1768     auto c = from_byte(scales);
1769 
1770     r = lerp(dr, r, c);
1771     g = lerp(dg, g, c);
1772     b = lerp(db, b, c);
1773     a = lerp(da, a, c);
1774 }
STAGE(lerp_565,const SkRasterPipeline_MemoryCtx * ctx)1775 STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
1776     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1777 
1778     F cr,cg,cb;
1779     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1780 
1781     F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1782 
1783     r = lerp(dr, r, cr);
1784     g = lerp(dg, g, cg);
1785     b = lerp(db, b, cb);
1786     a = lerp(da, a, ca);
1787 }
1788 
STAGE(emboss,const SkRasterPipeline_EmbossCtx * ctx)1789 STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
1790     auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy),
1791          aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy);
1792 
1793     F mul = from_byte(load<U8>(mptr, tail)),
1794       add = from_byte(load<U8>(aptr, tail));
1795 
1796     r = mad(r, mul, add);
1797     g = mad(g, mul, add);
1798     b = mad(b, mul, add);
1799 }
1800 
STAGE(byte_tables,const void * ctx)1801 STAGE(byte_tables, const void* ctx) {  // TODO: rename Tables SkRasterPipeline_ByteTablesCtx
1802     struct Tables { const uint8_t *r, *g, *b, *a; };
1803     auto tables = (const Tables*)ctx;
1804 
1805     r = from_byte(gather(tables->r, to_unorm(r, 255)));
1806     g = from_byte(gather(tables->g, to_unorm(g, 255)));
1807     b = from_byte(gather(tables->b, to_unorm(b, 255)));
1808     a = from_byte(gather(tables->a, to_unorm(a, 255)));
1809 }
1810 
strip_sign(F x,U32 * sign)1811 SI F strip_sign(F x, U32* sign) {
1812     U32 bits = bit_cast<U32>(x);
1813     *sign = bits & 0x80000000;
1814     return bit_cast<F>(bits ^ *sign);
1815 }
1816 
apply_sign(F x,U32 sign)1817 SI F apply_sign(F x, U32 sign) {
1818     return bit_cast<F>(sign | bit_cast<U32>(x));
1819 }
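// For example, strip_sign(-2.0f, &sign) returns 2.0f with sign == 0x80000000, and
// apply_sign(2.0f, sign) restores -2.0f; the transfer-function stages below use this pair
// to treat negative inputs as mirrored positives.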
1820 
STAGE(parametric,const skcms_TransferFunction * ctx)1821 STAGE(parametric, const skcms_TransferFunction* ctx) {
1822     auto fn = [&](F v) {
1823         U32 sign;
1824         v = strip_sign(v, &sign);
1825 
1826         F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f)
1827                                       , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e);
1828         return apply_sign(r, sign);
1829     };
1830     r = fn(r);
1831     g = fn(g);
1832     b = fn(b);
1833 }
1834 
STAGE(gamma_,const float * G)1835 STAGE(gamma_, const float* G) {
1836     auto fn = [&](F v) {
1837         U32 sign;
1838         v = strip_sign(v, &sign);
1839         return apply_sign(approx_powf(v, *G), sign);
1840     };
1841     r = fn(r);
1842     g = fn(g);
1843     b = fn(b);
1844 }
1845 
STAGE(from_srgb,Ctx::None)1846 STAGE(from_srgb, Ctx::None) {
1847     auto fn = [](F s) {
1848         U32 sign;
1849         s = strip_sign(s, &sign);
1850         auto lo = s * (1/12.92f);
1851         auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f);
1852         return apply_sign(if_then_else(s < 0.055f, lo, hi), sign);
1853     };
1854     r = fn(r);
1855     g = fn(g);
1856     b = fn(b);
1857 }
STAGE(to_srgb,Ctx::None)1858 STAGE(to_srgb, Ctx::None) {
1859     auto fn = [](F l) {
1860         U32 sign;
1861         l = strip_sign(l, &sign);
1862         // We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
1863     #if defined(JUMPER_IS_AVX512)
1864         const float c = 1.130026340485f,
1865                     d = 0.141387879848f;
1866     #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
1867           defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW )
1868         const float c = 1.130048394203f,
1869                     d = 0.141357362270f;
1870     #elif defined(JUMPER_IS_NEON)
1871         const float c = 1.129999995232f,
1872                     d = 0.141381442547f;
1873     #else
1874         const float c = 1.129999995232f,
1875                     d = 0.141377761960f;
1876     #endif
1877         F t = rsqrt(l);
1878         auto lo = l * 12.92f;
1879         auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), c)
1880                 * rcp(d + t);
1881         return apply_sign(if_then_else(l < 0.00465985f, lo, hi), sign);
1882     };
1883     r = fn(r);
1884     g = fn(g);
1885     b = fn(b);
1886 }
1887 
STAGE(load_a8,const SkRasterPipeline_MemoryCtx * ctx)1888 STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1889     auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1890 
1891     r = g = b = 0.0f;
1892     a = from_byte(load<U8>(ptr, tail));
1893 }
STAGE(load_a8_dst,const SkRasterPipeline_MemoryCtx * ctx)1894 STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1895     auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1896 
1897     dr = dg = db = 0.0f;
1898     da = from_byte(load<U8>(ptr, tail));
1899 }
STAGE(gather_a8,const SkRasterPipeline_GatherCtx * ctx)1900 STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
1901     const uint8_t* ptr;
1902     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1903     r = g = b = 0.0f;
1904     a = from_byte(gather(ptr, ix));
1905 }
STAGE(store_a8,const SkRasterPipeline_MemoryCtx * ctx)1906 STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1907     auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
1908 
1909     U8 packed = pack(pack(to_unorm(a, 255)));
1910     store(ptr, packed, tail);
1911 }
1912 
STAGE(load_565,const SkRasterPipeline_MemoryCtx * ctx)1913 STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
1914     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1915 
1916     from_565(load<U16>(ptr, tail), &r,&g,&b);
1917     a = 1.0f;
1918 }
STAGE(load_565_dst,const SkRasterPipeline_MemoryCtx * ctx)1919 STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1920     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1921 
1922     from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1923     da = 1.0f;
1924 }
STAGE(gather_565,const SkRasterPipeline_GatherCtx * ctx)1925 STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
1926     const uint16_t* ptr;
1927     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1928     from_565(gather(ptr, ix), &r,&g,&b);
1929     a = 1.0f;
1930 }
STAGE(store_565,const SkRasterPipeline_MemoryCtx * ctx)1931 STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
1932     auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1933 
1934     U16 px = pack( to_unorm(r, 31) << 11
1935                  | to_unorm(g, 63) <<  5
1936                  | to_unorm(b, 31)      );
1937     store(ptr, px, tail);
1938 }
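// For example, pure green packs as to_unorm(g, 63) == 63 shifted into bits 5..10, i.e. px == 0x07E0.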
1939 
STAGE(load_4444,const SkRasterPipeline_MemoryCtx * ctx)1940 STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
1941     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1942     from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
1943 }
STAGE(load_4444_dst,const SkRasterPipeline_MemoryCtx * ctx)1944 STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1945     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1946     from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
1947 }
STAGE(gather_4444,const SkRasterPipeline_GatherCtx * ctx)1948 STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
1949     const uint16_t* ptr;
1950     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1951     from_4444(gather(ptr, ix), &r,&g,&b,&a);
1952 }
STAGE(store_4444,const SkRasterPipeline_MemoryCtx * ctx)1953 STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
1954     auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1955     U16 px = pack( to_unorm(r, 15) << 12
1956                  | to_unorm(g, 15) <<  8
1957                  | to_unorm(b, 15) <<  4
1958                  | to_unorm(a, 15)      );
1959     store(ptr, px, tail);
1960 }
1961 
STAGE(load_8888,const SkRasterPipeline_MemoryCtx * ctx)1962 STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1963     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
1964     from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
1965 }
STAGE(load_8888_dst,const SkRasterPipeline_MemoryCtx * ctx)1966 STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1967     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
1968     from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
1969 }
STAGE(gather_8888,const SkRasterPipeline_GatherCtx * ctx)1970 STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
1971     const uint32_t* ptr;
1972     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1973     from_8888(gather(ptr, ix), &r,&g,&b,&a);
1974 }
STAGE(store_8888,const SkRasterPipeline_MemoryCtx * ctx)1975 STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1976     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1977 
1978     U32 px = to_unorm(r, 255)
1979            | to_unorm(g, 255) <<  8
1980            | to_unorm(b, 255) << 16
1981            | to_unorm(a, 255) << 24;
1982     store(ptr, px, tail);
1983 }
1984 
STAGE(load_rg88,const SkRasterPipeline_MemoryCtx * ctx)1985 STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
1986     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1987     b = 0;
1988     a = 1;
1989     from_88(load<U16>(ptr, tail), &r,&g);
1990 }
STAGE(store_rg88,const SkRasterPipeline_MemoryCtx * ctx)1991 STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
1992     auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1993 
1994     U16 px = pack( to_unorm(r, 255)
1995                  | to_unorm(g, 255) <<  8);
1996     store(ptr, px, tail);
1997 }
1998 
STAGE(load_a16,const SkRasterPipeline_MemoryCtx * ctx)1999 STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2000     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2001     r = g = b = 0;
2002     a = from_short(load<U16>(ptr, tail));
2003 }
STAGE(store_a16,const SkRasterPipeline_MemoryCtx * ctx)2004 STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2005     auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2006 
2007     U16 px = pack(to_unorm(a, 65535));
2008     store(ptr, px, tail);
2009 }
STAGE(load_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2010 STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2011     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2012     b = 0; a = 1;
2013     from_1616(load<U32>(ptr, tail), &r,&g);
2014 }
STAGE(store_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2015 STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2016     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2017 
2018     U32 px = to_unorm(r, 65535)
2019            | to_unorm(g, 65535) <<  16;
2020     store(ptr, px, tail);
2021 }
STAGE(load_16161616,const SkRasterPipeline_MemoryCtx * ctx)2022 STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2023     auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2024     from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2025 }
STAGE(store_16161616,const SkRasterPipeline_MemoryCtx * ctx)2026 STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2027     auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
2028 
2029     U16 R = pack(to_unorm(r, 65535)),
2030         G = pack(to_unorm(g, 65535)),
2031         B = pack(to_unorm(b, 65535)),
2032         A = pack(to_unorm(a, 65535));
2033 
2034     store4(ptr,tail, R,G,B,A);
2035 }
2036 
2037 
STAGE(load_1010102,const SkRasterPipeline_MemoryCtx * ctx)2038 STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2039     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2040     from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2041 }
STAGE(load_1010102_dst,const SkRasterPipeline_MemoryCtx * ctx)2042 STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2043     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2044     from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2045 }
STAGE(gather_1010102,const SkRasterPipeline_GatherCtx * ctx)2046 STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
2047     const uint32_t* ptr;
2048     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2049     from_1010102(gather(ptr, ix), &r,&g,&b,&a);
2050 }
STAGE(store_1010102,const SkRasterPipeline_MemoryCtx * ctx)2051 STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2052     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2053 
2054     U32 px = to_unorm(r, 1023)
2055            | to_unorm(g, 1023) << 10
2056            | to_unorm(b, 1023) << 20
2057            | to_unorm(a,    3) << 30;
2058     store(ptr, px, tail);
2059 }
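// For example, opaque white packs as 0x3FF | 0x3FF<<10 | 0x3FF<<20 | 3<<30 == 0xFFFFFFFF.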
2060 
STAGE(load_f16,const SkRasterPipeline_MemoryCtx * ctx)2061 STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2062     auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2063 
2064     U16 R,G,B,A;
2065     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2066     r = from_half(R);
2067     g = from_half(G);
2068     b = from_half(B);
2069     a = from_half(A);
2070 }
STAGE(load_f16_dst,const SkRasterPipeline_MemoryCtx * ctx)2071 STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2072     auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2073 
2074     U16 R,G,B,A;
2075     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2076     dr = from_half(R);
2077     dg = from_half(G);
2078     db = from_half(B);
2079     da = from_half(A);
2080 }
STAGE(gather_f16,const SkRasterPipeline_GatherCtx * ctx)2081 STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
2082     const uint64_t* ptr;
2083     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2084     auto px = gather(ptr, ix);
2085 
2086     U16 R,G,B,A;
2087     load4((const uint16_t*)&px,0, &R,&G,&B,&A);
2088     r = from_half(R);
2089     g = from_half(G);
2090     b = from_half(B);
2091     a = from_half(A);
2092 }
STAGE(store_f16,const SkRasterPipeline_MemoryCtx * ctx)2093 STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2094     auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
2095     store4((uint16_t*)ptr,tail, to_half(r)
2096                               , to_half(g)
2097                               , to_half(b)
2098                               , to_half(a));
2099 }
2100 
STAGE(store_u16_be,const SkRasterPipeline_MemoryCtx * ctx)2101 STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) {
2102     auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);
2103 
2104     U16 R = bswap(pack(to_unorm(r, 65535))),
2105         G = bswap(pack(to_unorm(g, 65535))),
2106         B = bswap(pack(to_unorm(b, 65535))),
2107         A = bswap(pack(to_unorm(a, 65535)));
2108 
2109     store4(ptr,tail, R,G,B,A);
2110 }
2111 
STAGE(load_af16,const SkRasterPipeline_MemoryCtx * ctx)2112 STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2113     auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2114 
2115     U16 A = load<U16>((const uint16_t*)ptr, tail);
2116     r = 0;
2117     g = 0;
2118     b = 0;
2119     a = from_half(A);
2120 }
STAGE(store_af16,const SkRasterPipeline_MemoryCtx * ctx)2121 STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2122     auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2123     store(ptr, to_half(a), tail);
2124 }
2125 
STAGE(load_rgf16,const SkRasterPipeline_MemoryCtx * ctx)2126 STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2127     auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2128 
2129     U16 R,G;
2130     load2((const uint16_t*)ptr,tail, &R,&G);
2131     r = from_half(R);
2132     g = from_half(G);
2133     b = 0;
2134     a = from_half(0x3C00); // one
2135 }
STAGE(store_rgf16,const SkRasterPipeline_MemoryCtx * ctx)2136 STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2137     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2138     store2((uint16_t*)ptr, tail, to_half(r)
2139                                , to_half(g));
2140 }
2141 
STAGE(load_f32,const SkRasterPipeline_MemoryCtx * ctx)2142 STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2143     auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2144     load4(ptr,tail, &r,&g,&b,&a);
2145 }
STAGE(load_f32_dst,const SkRasterPipeline_MemoryCtx * ctx)2146 STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2147     auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2148     load4(ptr,tail, &dr,&dg,&db,&da);
2149 }
STAGE(gather_f32,const SkRasterPipeline_GatherCtx * ctx)2150 STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
2151     const float* ptr;
2152     U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2153     r = gather(ptr, 4*ix + 0);
2154     g = gather(ptr, 4*ix + 1);
2155     b = gather(ptr, 4*ix + 2);
2156     a = gather(ptr, 4*ix + 3);
2157 }
STAGE(store_f32,const SkRasterPipeline_MemoryCtx * ctx)2158 STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2159     auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
2160     store4(ptr,tail, r,g,b,a);
2161 }
2162 
STAGE(load_rgf32,const SkRasterPipeline_MemoryCtx * ctx)2163 STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2164     auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy);
2165     load2(ptr, tail, &r, &g);
2166     b = 0;
2167     a = 1;
2168 }
STAGE(store_rgf32,const SkRasterPipeline_MemoryCtx * ctx)2169 STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2170     auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy);
2171     store2(ptr, tail, r, g);
2172 }
2173 
exclusive_repeat(F v,const SkRasterPipeline_TileCtx * ctx)2174 SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
2175     return v - floor_(v*ctx->invScale)*ctx->scale;
2176 }
exclusive_mirror(F v,const SkRasterPipeline_TileCtx * ctx)2177 SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
2178     auto limit = ctx->scale;
2179     auto invLimit = ctx->invScale;
2180     return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2181 }
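// Worked examples with ctx->scale == 4 (invScale == 0.25): v == 5 repeats to
// 5 - floor(1.25)*4 == 1, and mirrors to abs((5-4) - 8*floor(0.125) - 4) == 3,
// i.e. 5 reflects back to 3.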
2182 // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
2183 // The gather stages will hard clamp the output of these stages to [0,limit)...
2184 // we just need to do the basic repeat or mirroring.
STAGE(repeat_x,const SkRasterPipeline_TileCtx * ctx)2185 STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); }
STAGE(repeat_y,const SkRasterPipeline_TileCtx * ctx)2186 STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); }
STAGE(mirror_x,const SkRasterPipeline_TileCtx * ctx)2187 STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); }
STAGE(mirror_y,const SkRasterPipeline_TileCtx * ctx)2188 STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); }
2189 
2190 // Clamp x to [0,1], both sides inclusive (think, gradients).
2191 // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
clamp_01(F v)2192 SI F clamp_01(F v) { return min(max(0, v), 1); }
2193 
STAGE(clamp_x_1,Ctx::None)2194 STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); }
STAGE(repeat_x_1,Ctx::None)2195 STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); }
STAGE(mirror_x_1,Ctx::None)2196 STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); }
2197 
2198 // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
2199 //      mask == 0x00000000 if the coordinate(s) are out of bounds
2200 //      mask == 0xFFFFFFFF if the coordinate(s) are in bounds
2201 // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
2202 // if either of the coordinates were out of bounds.
2203 
STAGE(decal_x,SkRasterPipeline_DecalTileCtx * ctx)2204 STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
2205     auto w = ctx->limit_x;
2206     sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
2207 }
STAGE(decal_y,SkRasterPipeline_DecalTileCtx * ctx)2208 STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
2209     auto h = ctx->limit_y;
2210     sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
2211 }
STAGE(decal_x_and_y,SkRasterPipeline_DecalTileCtx * ctx)2212 STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
2213     auto w = ctx->limit_x;
2214     auto h = ctx->limit_y;
2215     sk_unaligned_store(ctx->mask,
2216                     cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h)));
2217 }
STAGE(check_decal_mask,SkRasterPipeline_DecalTileCtx * ctx)2218 STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
2219     auto mask = sk_unaligned_load<U32>(ctx->mask);
2220     r = bit_cast<F>( bit_cast<U32>(r) & mask );
2221     g = bit_cast<F>( bit_cast<U32>(g) & mask );
2222     b = bit_cast<F>( bit_cast<U32>(b) & mask );
2223     a = bit_cast<F>( bit_cast<U32>(a) & mask );
2224 }
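// For example, with limit_x == 8 a coordinate r == 9 fails (r < w), so the stored mask is 0
// and check_decal_mask later zeroes r,g,b,a for that lane; r == 3 passes and leaves the
// gathered color untouched.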
2225 
STAGE(alpha_to_gray,Ctx::None)2226 STAGE(alpha_to_gray, Ctx::None) {
2227     r = g = b = a;
2228     a = 1;
2229 }
STAGE(alpha_to_gray_dst,Ctx::None)2230 STAGE(alpha_to_gray_dst, Ctx::None) {
2231     dr = dg = db = da;
2232     da = 1;
2233 }
STAGE(bt709_luminance_or_luma_to_alpha,Ctx::None)2234 STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) {
2235     a = r*0.2126f + g*0.7152f + b*0.0722f;
2236     r = g = b = 0;
2237 }
2238 
STAGE(matrix_translate,const float * m)2239 STAGE(matrix_translate, const float* m) {
2240     r += m[0];
2241     g += m[1];
2242 }
STAGE(matrix_scale_translate,const float * m)2243 STAGE(matrix_scale_translate, const float* m) {
2244     r = mad(r,m[0], m[2]);
2245     g = mad(g,m[1], m[3]);
2246 }
STAGE(matrix_2x3,const float * m)2247 STAGE(matrix_2x3, const float* m) {
2248     auto R = mad(r,m[0], mad(g,m[2], m[4])),
2249          G = mad(r,m[1], mad(g,m[3], m[5]));
2250     r = R;
2251     g = G;
2252 }
STAGE(matrix_3x3,const float * m)2253 STAGE(matrix_3x3, const float* m) {
2254     auto R = mad(r,m[0], mad(g,m[3], b*m[6])),
2255          G = mad(r,m[1], mad(g,m[4], b*m[7])),
2256          B = mad(r,m[2], mad(g,m[5], b*m[8]));
2257     r = R;
2258     g = G;
2259     b = B;
2260 }
STAGE(matrix_3x4,const float * m)2261 STAGE(matrix_3x4, const float* m) {
2262     auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
2263          G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
2264          B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
2265     r = R;
2266     g = G;
2267     b = B;
2268 }
STAGE(matrix_4x5,const float * m)2269 STAGE(matrix_4x5, const float* m) {
2270     auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))),
2271          G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))),
2272          B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))),
2273          A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19]))));
2274     r = R;
2275     g = G;
2276     b = B;
2277     a = A;
2278 }
STAGE(matrix_4x3,const float * m)2279 STAGE(matrix_4x3, const float* m) {
2280     auto X = r,
2281          Y = g;
2282 
2283     r = mad(X, m[0], mad(Y, m[4], m[ 8]));
2284     g = mad(X, m[1], mad(Y, m[5], m[ 9]));
2285     b = mad(X, m[2], mad(Y, m[6], m[10]));
2286     a = mad(X, m[3], mad(Y, m[7], m[11]));
2287 }
STAGE(matrix_perspective,const float * m)2288 STAGE(matrix_perspective, const float* m) {
2289     // N.B. Unlike the other matrix_ stages, this matrix is row-major.
2290     auto R = mad(r,m[0], mad(g,m[1], m[2])),
2291          G = mad(r,m[3], mad(g,m[4], m[5])),
2292          Z = mad(r,m[6], mad(g,m[7], m[8]));
2293     r = R * rcp(Z);
2294     g = G * rcp(Z);
2295 }
2296 
gradient_lookup(const SkRasterPipeline_GradientCtx * c,U32 idx,F t,F * r,F * g,F * b,F * a)2297 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
2298                         F* r, F* g, F* b, F* a) {
2299     F fr, br, fg, bg, fb, bb, fa, ba;
2300 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2301     if (c->stopCount <=8) {
2302         fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
2303         br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
2304         fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx);
2305         bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx);
2306         fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx);
2307         bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx);
2308         fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx);
2309         ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx);
2310     } else
2311 #endif
2312     {
2313         fr = gather(c->fs[0], idx);
2314         br = gather(c->bs[0], idx);
2315         fg = gather(c->fs[1], idx);
2316         bg = gather(c->bs[1], idx);
2317         fb = gather(c->fs[2], idx);
2318         bb = gather(c->bs[2], idx);
2319         fa = gather(c->fs[3], idx);
2320         ba = gather(c->bs[3], idx);
2321     }
2322 
2323     *r = mad(t, fr, br);
2324     *g = mad(t, fg, bg);
2325     *b = mad(t, fb, bb);
2326     *a = mad(t, fa, ba);
2327 }
2328 
STAGE(evenly_spaced_gradient,const SkRasterPipeline_GradientCtx * c)2329 STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
2330     auto t = r;
2331     auto idx = trunc_(t * (c->stopCount-1));
2332     gradient_lookup(c, idx, t, &r, &g, &b, &a);
2333 }
2334 
STAGE(gradient,const SkRasterPipeline_GradientCtx * c)2335 STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
2336     auto t = r;
2337     U32 idx = 0;
2338 
2339     // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
2340     for (size_t i = 1; i < c->stopCount; i++) {
2341         idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
2342     }
2343 
2344     gradient_lookup(c, idx, t, &r, &g, &b, &a);
2345 }
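// The loop is effectively a vectorized lower_bound: idx counts how many stops satisfy t >= ts[i].
// For example, with ts = {0, 0.5, 1} and t = 0.7, only ts[1] qualifies, so idx == 1.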
2346 
STAGE(evenly_spaced_2_stop_gradient,const void * ctx)2347 STAGE(evenly_spaced_2_stop_gradient, const void* ctx) {
2348     // TODO: Rename Ctx SkRasterPipeline_EvenlySpaced2StopGradientCtx.
2349     struct Ctx { float f[4], b[4]; };
2350     auto c = (const Ctx*)ctx;
2351 
2352     auto t = r;
2353     r = mad(t, c->f[0], c->b[0]);
2354     g = mad(t, c->f[1], c->b[1]);
2355     b = mad(t, c->f[2], c->b[2]);
2356     a = mad(t, c->f[3], c->b[3]);
2357 }
2358 
STAGE(xy_to_unit_angle,Ctx::None)2359 STAGE(xy_to_unit_angle, Ctx::None) {
2360     F X = r,
2361       Y = g;
2362     F xabs = abs_(X),
2363       yabs = abs_(Y);
2364 
2365     F slope = min(xabs, yabs)/max(xabs, yabs);
2366     F s = slope * slope;
2367 
2368     // Use a 7th degree polynomial to approximate atan.
2369     // This was generated using sollya.gforge.inria.fr.
2370     // A float optimized polynomial was generated using the following command.
2371     // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
2372     F phi = slope
2373              * (0.15912117063999176025390625f     + s
2374              * (-5.185396969318389892578125e-2f   + s
2375              * (2.476101927459239959716796875e-2f + s
2376              * (-7.0547382347285747528076171875e-3f))));
2377 
2378     phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
2379     phi = if_then_else(X < 0.0f   , 1.0f/2.0f - phi, phi);
2380     phi = if_then_else(Y < 0.0f   , 1.0f - phi     , phi);
2381     phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
2382     r = phi;
2383 }
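// Sanity check: at (x,y) = (1,1), slope == 1 and the polynomial evaluates to roughly
// 0.1591 - 0.0519 + 0.0248 - 0.0071 ~= 0.125, i.e. 1/8 of a turn (45 degrees), as expected.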
2384 
STAGE(xy_to_radius,Ctx::None)2385 STAGE(xy_to_radius, Ctx::None) {
2386     F X2 = r * r,
2387       Y2 = g * g;
2388     r = sqrt_(X2 + Y2);
2389 }
2390 
2391 // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
2392 
STAGE(negate_x,Ctx::None)2393 STAGE(negate_x, Ctx::None) { r = -r; }
2394 
STAGE(xy_to_2pt_conical_strip,const SkRasterPipeline_2PtConicalCtx * ctx)2395 STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
2396     F x = r, y = g, &t = r;
2397     t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
2398 }
2399 
STAGE(xy_to_2pt_conical_focal_on_circle,Ctx::None)2400 STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) {
2401     F x = r, y = g, &t = r;
2402     t = x + y*y / x; // (x^2 + y^2) / x
2403 }
2404 
STAGE(xy_to_2pt_conical_well_behaved,const SkRasterPipeline_2PtConicalCtx * ctx)2405 STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
2406     F x = r, y = g, &t = r;
2407     t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2408 }
2409 
STAGE(xy_to_2pt_conical_greater,const SkRasterPipeline_2PtConicalCtx * ctx)2410 STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
2411     F x = r, y = g, &t = r;
2412     t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2413 }
2414 
STAGE(xy_to_2pt_conical_smaller,const SkRasterPipeline_2PtConicalCtx * ctx)2415 STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
2416     F x = r, y = g, &t = r;
2417     t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2418 }
2419 
STAGE(alter_2pt_conical_compensate_focal,const SkRasterPipeline_2PtConicalCtx * ctx)2420 STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
2421     F& t = r;
2422     t = t + ctx->fP1; // ctx->fP1 = f
2423 }
2424 
STAGE(alter_2pt_conical_unswap,Ctx::None)2425 STAGE(alter_2pt_conical_unswap, Ctx::None) {
2426     F& t = r;
2427     t = 1 - t;
2428 }
2429 
STAGE(mask_2pt_conical_nan,SkRasterPipeline_2PtConicalCtx * c)2430 STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
2431     F& t = r;
2432     auto is_degenerate = (t != t); // NaN
2433     t = if_then_else(is_degenerate, F(0), t);
2434     sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2435 }
2436 
STAGE(mask_2pt_conical_degenerates,SkRasterPipeline_2PtConicalCtx * c)2437 STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
2438     F& t = r;
2439     auto is_degenerate = (t <= 0) | (t != t);
2440     t = if_then_else(is_degenerate, F(0), t);
2441     sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2442 }
2443 
STAGE(apply_vector_mask,const uint32_t * ctx)2444 STAGE(apply_vector_mask, const uint32_t* ctx) {
2445     const U32 mask = sk_unaligned_load<U32>(ctx);
2446     r = bit_cast<F>(bit_cast<U32>(r) & mask);
2447     g = bit_cast<F>(bit_cast<U32>(g) & mask);
2448     b = bit_cast<F>(bit_cast<U32>(b) & mask);
2449     a = bit_cast<F>(bit_cast<U32>(a) & mask);
2450 }
2451 
STAGE(save_xy,SkRasterPipeline_SamplerCtx * c)2452 STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) {
2453     // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
2454     // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
2455     // surrounding (x,y) at (0.5,0.5) off-center.
2456     F fx = fract(r + 0.5f),
2457       fy = fract(g + 0.5f);
2458 
2459     // Samplers will need to load x and fx, or y and fy.
2460     sk_unaligned_store(c->x,  r);
2461     sk_unaligned_store(c->y,  g);
2462     sk_unaligned_store(c->fx, fx);
2463     sk_unaligned_store(c->fy, fy);
2464 }
2465 
STAGE(accumulate,const SkRasterPipeline_SamplerCtx * c)2466 STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
2467     // Bilinear and bicubic filters are both separable, so we produce independent contributions
2468     // from x and y, multiplying them together here to get each pixel's total scale factor.
2469     auto scale = sk_unaligned_load<F>(c->scalex)
2470                * sk_unaligned_load<F>(c->scaley);
2471     dr = mad(scale, r, dr);
2472     dg = mad(scale, g, dg);
2473     db = mad(scale, b, db);
2474     da = mad(scale, a, da);
2475 }
2476 
2477 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2478 // are combined in direct proportion to their area overlapping that logical query pixel.
2479 // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
2480 // The y-axis is symmetric.
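// For example, with fx = 0.25 and fy = 0.5 the four bilinear weights come out to
// 0.75*0.5, 0.25*0.5, 0.75*0.5, 0.25*0.5 = 0.375, 0.125, 0.375, 0.125, which sum to 1.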
2481 
2482 template <int kScale>
bilinear_x(SkRasterPipeline_SamplerCtx * ctx,F * x)2483 SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2484     *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2485     F fx = sk_unaligned_load<F>(ctx->fx);
2486 
2487     F scalex;
2488     if (kScale == -1) { scalex = 1.0f - fx; }
2489     if (kScale == +1) { scalex =        fx; }
2490     sk_unaligned_store(ctx->scalex, scalex);
2491 }
2492 template <int kScale>
bilinear_y(SkRasterPipeline_SamplerCtx * ctx,F * y)2493 SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2494     *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2495     F fy = sk_unaligned_load<F>(ctx->fy);
2496 
2497     F scaley;
2498     if (kScale == -1) { scaley = 1.0f - fy; }
2499     if (kScale == +1) { scaley =        fy; }
2500     sk_unaligned_store(ctx->scaley, scaley);
2501 }
2502 
STAGE(bilinear_nx,SkRasterPipeline_SamplerCtx * ctx)2503 STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px,SkRasterPipeline_SamplerCtx * ctx)2504 STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny,SkRasterPipeline_SamplerCtx * ctx)2505 STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py,SkRasterPipeline_SamplerCtx * ctx)2506 STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); }
2507 
2508 
2509 // In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2510 // pixel center are combined with a non-uniform cubic filter, with higher values near the center.
2511 //
2512 // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
2513 // See GrCubicEffect for details of this particular filter.
2514 
bicubic_near(F t)2515 SI F bicubic_near(F t) {
2516     // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
2517     return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f));
2518 }
bicubic_far(F t)2519 SI F bicubic_far(F t) {
2520     // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
2521     return (t*t)*mad((7/18.0f), t, (-6/18.0f));
2522 }
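// Sanity check: for any fractional offset f, the four weights
// bicubic_far(1-f) + bicubic_near(1-f) + bicubic_near(f) + bicubic_far(f) sum to exactly 1
// (the filter is a partition of unity); e.g. at f = 0 they are 1/18 + 16/18 + 1/18 + 0.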
2523 
2524 template <int kScale>
bicubic_x(SkRasterPipeline_SamplerCtx * ctx,F * x)2525 SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2526     *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2527     F fx = sk_unaligned_load<F>(ctx->fx);
2528 
2529     F scalex;
2530     if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
2531     if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
2532     if (kScale == +1) { scalex = bicubic_near(       fx); }
2533     if (kScale == +3) { scalex = bicubic_far (       fx); }
2534     sk_unaligned_store(ctx->scalex, scalex);
2535 }
2536 template <int kScale>
bicubic_y(SkRasterPipeline_SamplerCtx * ctx,F * y)2537 SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2538     *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2539     F fy = sk_unaligned_load<F>(ctx->fy);
2540 
2541     F scaley;
2542     if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
2543     if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
2544     if (kScale == +1) { scaley = bicubic_near(       fy); }
2545     if (kScale == +3) { scaley = bicubic_far (       fy); }
2546     sk_unaligned_store(ctx->scaley, scaley);
2547 }
2548 
STAGE(bicubic_n3x,SkRasterPipeline_SamplerCtx * ctx)2549 STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x,SkRasterPipeline_SamplerCtx * ctx)2550 STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x,SkRasterPipeline_SamplerCtx * ctx)2551 STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x,SkRasterPipeline_SamplerCtx * ctx)2552 STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); }
2553 
STAGE(bicubic_n3y,SkRasterPipeline_SamplerCtx * ctx)2554 STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y,SkRasterPipeline_SamplerCtx * ctx)2555 STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y,SkRasterPipeline_SamplerCtx * ctx)2556 STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y,SkRasterPipeline_SamplerCtx * ctx)2557 STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); }
2558 
STAGE(callback,SkRasterPipeline_CallbackCtx * c)2559 STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
2560     store4(c->rgba,0, r,g,b,a);
2561     c->fn(c, tail ? tail : N);
2562     load4(c->read_from,0, &r,&g,&b,&a);
2563 }
2564 
2565 // shader:      void main(float x, float y, inout half4 color)
2566 // colorfilter: void main(inout half4 color)
STAGE(interpreter,SkRasterPipeline_InterpreterCtx * c)2567 STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
2568     // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in
2569     // the interpreter. This is a known issue, and will be addressed at some point.
2570     float xx[N], yy[N],
2571           rr[N], gg[N], bb[N], aa[N];
2572 
2573     float*  args[]  = { xx, yy, rr, gg, bb, aa };
2574     float** in_args = args;
2575     int     in_count = 6;
2576 
2577     if (c->shaderConvention) {
2578         // our caller must have called seed_shader to set these
2579         sk_unaligned_store(xx, r);
2580         sk_unaligned_store(yy, g);
2581         sk_unaligned_store(rr, F(c->paintColor.fR));
2582         sk_unaligned_store(gg, F(c->paintColor.fG));
2583         sk_unaligned_store(bb, F(c->paintColor.fB));
2584         sk_unaligned_store(aa, F(c->paintColor.fA));
2585     } else {
2586         in_args += 2;   // skip x,y
2587         in_count = 4;
2588         sk_unaligned_store(rr, r);
2589         sk_unaligned_store(gg, g);
2590         sk_unaligned_store(bb, b);
2591         sk_unaligned_store(aa, a);
2592     }
2593 
2594     SkAssertResult(c->byteCode->runStriped(c->fn, in_args, in_count, tail ? tail : N,
2595                                            (const float*)c->inputs, c->ninputs, nullptr, 0));
2596 
2597     r = sk_unaligned_load<F>(rr);
2598     g = sk_unaligned_load<F>(gg);
2599     b = sk_unaligned_load<F>(bb);
2600     a = sk_unaligned_load<F>(aa);
2601 }
2602 
2603 STAGE(gauss_a_to_rgba, Ctx::None) {
2604     // x = 1 - x;
2605     // exp(-x * x * 4) - 0.018f;
2606     // ... now approximate with quartic
2607     //
2608     const float c4 = -2.26661229133605957031f;
2609     const float c3 = 2.89795351028442382812f;
2610     const float c2 = 0.21345567703247070312f;
2611     const float c1 = 0.15489584207534790039f;
2612     const float c0 = 0.00030726194381713867f;
2613     a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0);
2614     r = a;
2615     g = a;
2616     b = a;
2617 }
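// As a sanity check on the quartic above: at a=0 it evaluates to c0 ~= 0.00031,
// close to exp(-4) - 0.018 ~= 0.0003, and at a=1 the five coefficients sum to
// ~1.0, so full coverage stays fully opaque.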
2618 
2619 SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
2620     // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here.
2621     switch (mode) {
2622         case SkTileMode::kDecal:  // TODO, for now fallthrough to clamp
2623         case SkTileMode::kClamp:  return v;
2624         case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
2625         case SkTileMode::kMirror:
2626             return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2627     }
2628     SkUNREACHABLE;
2629 }
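// e.g. with limit=10 (so invLimit=0.1) and v=12: kRepeat gives
// 12 - floor(1.2)*10 = 2, and kMirror gives abs(2 - 20*floor(0.1) - 10) = 8,
// folding v back into [0,10] as expected.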
2630 
2631 SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
2632                F* r, F* g, F* b, F* a) {
2633     x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
2634     y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
2635 
2636     switch (ctx->ct) {
2637         default: *r = *g = *b = *a = 0;  // TODO
2638                  break;
2639 
2640         case kRGBA_8888_SkColorType:
2641         case kBGRA_8888_SkColorType: {
2642             const uint32_t* ptr;
2643             U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2644             from_8888(gather(ptr, ix), r,g,b,a);
2645             if (ctx->ct == kBGRA_8888_SkColorType) {
2646                 std::swap(*r,*b);
2647             }
2648         } break;
2649     }
2650 }
2651 
2652 template <int D>
2653 SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
2654                 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
2655                 F* r, F* g, F* b, F* a) {
2656 
2657     float start = -0.5f*(D-1);
2658 
2659     *r = *g = *b = *a = 0;
2660     F y = cy + start;
2661     for (int j = 0; j < D; j++, y += 1.0f) {
2662         F x = cx + start;
2663         for (int i = 0; i < D; i++, x += 1.0f) {
2664             F R,G,B,A;
2665             sample(ctx, x,y, &R,&G,&B,&A);
2666 
2667             F w = wx[i] * wy[j];
2668             *r = mad(w,R,*r);
2669             *g = mad(w,G,*g);
2670             *b = mad(w,B,*b);
2671             *a = mad(w,A,*a);
2672         }
2673     }
2674 }
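// Each weight array is expected to sum to 1 across its D entries (e.g.
// (1-fx) + fx for bilinear), so the D*D products wx[i]*wy[j] also sum to 1
// and the accumulated color remains a normalized weighted average.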
2675 
2676 STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
2677     F x = r, fx = fract(x + 0.5f),
2678       y = g, fy = fract(y + 0.5f);
2679     const F wx[] = {1.0f - fx, fx};
2680     const F wy[] = {1.0f - fy, fy};
2681 
2682     sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2683 }
2684 STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) {
2685     F x = r, fx = fract(x + 0.5f),
2686       y = g, fy = fract(y + 0.5f);
2687     const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) };
2688     const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) };
2689 
2690     sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2691 }
2692 
2693 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
2694 STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2695     // (cx,cy) are the center of our sample.
2696     F cx = r,
2697       cy = g;
2698 
2699     // All sample points are at the same fractional offset (fx,fy).
2700     // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
2701     F fx = fract(cx + 0.5f),
2702       fy = fract(cy + 0.5f);
2703 
2704     // We'll accumulate the color of all four samples into {r,g,b,a} directly.
2705     r = g = b = a = 0;
2706 
2707     for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
2708     for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
2709         // (x,y) are the coordinates of this sample point.
2710         F x = cx + dx,
2711           y = cy + dy;
2712 
2713         // ix_and_ptr() will clamp to the image's bounds for us.
2714         const uint32_t* ptr;
2715         U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2716 
2717         F sr,sg,sb,sa;
2718         from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2719 
2720         // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2721         // are combined in direct proportion to their area overlapping that logical query pixel.
2722         // At positive offsets, the x-axis contribution to that rectangle is fx,
2723         // or (1-fx) at negative x.  Same deal for y.
2724         F sx = (dx > 0) ? fx : 1.0f - fx,
2725           sy = (dy > 0) ? fy : 1.0f - fy,
2726           area = sx * sy;
2727 
2728         r += sr * area;
2729         g += sg * area;
2730         b += sb * area;
2731         a += sa * area;
2732     }
2733 }
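// Worked example: with fx=0.25 and fy=0.5 the four areas come out to
// 0.75*0.5, 0.25*0.5, 0.75*0.5, 0.25*0.5 = 0.375 + 0.125 + 0.375 + 0.125,
// which sums to 1, so {r,g,b,a} ends up as a proper weighted average.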
2734 
2735 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
2736 STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2737     // (cx,cy) are the center of our sample.
2738     F cx = r,
2739       cy = g;
2740 
2741     // All sample points are at the same fractional offset (fx,fy).
2742     // They're the 16 taps of a 4x4 grid surrounding (cx,cy), spaced one pixel apart.
2743     F fx = fract(cx + 0.5f),
2744       fy = fract(cy + 0.5f);
2745 
2746     // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
2747     r = g = b = a = 0;
2748 
2749     const F scaley[4] = {
2750         bicubic_far (1.0f - fy), bicubic_near(1.0f - fy),
2751         bicubic_near(       fy), bicubic_far (       fy),
2752     };
2753     const F scalex[4] = {
2754         bicubic_far (1.0f - fx), bicubic_near(1.0f - fx),
2755         bicubic_near(       fx), bicubic_far (       fx),
2756     };
2757 
2758     F sample_y = cy - 1.5f;
2759     for (int yy = 0; yy <= 3; ++yy) {
2760         F sample_x = cx - 1.5f;
2761         for (int xx = 0; xx <= 3; ++xx) {
2762             F scale = scalex[xx] * scaley[yy];
2763 
2764             // ix_and_ptr() will clamp to the image's bounds for us.
2765             const uint32_t* ptr;
2766             U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
2767 
2768             F sr,sg,sb,sa;
2769             from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2770 
2771             r = mad(scale, sr, r);
2772             g = mad(scale, sg, g);
2773             b = mad(scale, sb, b);
2774             a = mad(scale, sa, a);
2775 
2776             sample_x += 1;
2777         }
2778         sample_y += 1;
2779     }
2780 }
2781 
2782 // ~~~~~~ GrSwizzle stage ~~~~~~ //
2783 
2784 STAGE(swizzle, void* ctx) {
2785     auto ir = r, ig = g, ib = b, ia = a;
2786     F* o[] = {&r, &g, &b, &a};
2787     char swiz[4];
2788     memcpy(swiz, &ctx, sizeof(swiz));
2789 
2790     for (int i = 0; i < 4; ++i) {
2791         switch (swiz[i]) {
2792             case 'r': *o[i] = ir;   break;
2793             case 'g': *o[i] = ig;   break;
2794             case 'b': *o[i] = ib;   break;
2795             case 'a': *o[i] = ia;   break;
2796             case '0': *o[i] = F(0); break;
2797             case '1': *o[i] = F(1); break;
2798             default:                break;
2799         }
2800     }
2801 }
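// e.g. a swizzle of "bgra" swaps the red and blue channels, while "rgb1"
// forces alpha to fully opaque; a '0' clears its channel entirely.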
2802 
2803 namespace lowp {
2804 #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
2805     // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
2806     // we don't generate lowp stages.  All these nullptrs will tell SkJumper.cpp to always use the
2807     // highp float pipeline.
2808     #define M(st) static void (*st)(void) = nullptr;
2809         SK_RASTER_PIPELINE_STAGES(M)
2810     #undef M
2811     static void (*just_return)(void) = nullptr;
2812 
2813     static void start_pipeline(size_t,size_t,size_t,size_t, void**) {}
2814 
2815 #else  // We are compiling vector code with Clang... let's make some lowp stages!
2816 
2817 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2818     using U8  = uint8_t  __attribute__((ext_vector_type(16)));
2819     using U16 = uint16_t __attribute__((ext_vector_type(16)));
2820     using I16 =  int16_t __attribute__((ext_vector_type(16)));
2821     using I32 =  int32_t __attribute__((ext_vector_type(16)));
2822     using U32 = uint32_t __attribute__((ext_vector_type(16)));
2823     using F   = float    __attribute__((ext_vector_type(16)));
2824 #else
2825     using U8  = uint8_t  __attribute__((ext_vector_type(8)));
2826     using U16 = uint16_t __attribute__((ext_vector_type(8)));
2827     using I16 =  int16_t __attribute__((ext_vector_type(8)));
2828     using I32 =  int32_t __attribute__((ext_vector_type(8)));
2829     using U32 = uint32_t __attribute__((ext_vector_type(8)));
2830     using F   = float    __attribute__((ext_vector_type(8)));
2831 #endif
2832 
2833 static const size_t N = sizeof(U16) / sizeof(uint16_t);
2834 
2835 // Once again, some platforms benefit from a restricted Stage calling convention,
2836 // but others can pass tons and tons of registers and we're happy to exploit that.
2837 // It's exactly the same decision and implementation strategy as the F stages above.
2838 #if JUMPER_NARROW_STAGES
2839     struct Params {
2840         size_t dx, dy, tail;
2841         U16 dr,dg,db,da;
2842     };
2843     using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a);
2844 #else
2845     // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
2846     using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
2847                               U16  r, U16  g, U16  b, U16  a,
2848                               U16 dr, U16 dg, U16 db, U16 da);
2849 #endif
2850 
2851 static void start_pipeline(const size_t x0,     const size_t y0,
2852                            const size_t xlimit, const size_t ylimit, void** program) {
2853     auto start = (Stage)load_and_inc(program);
2854     for (size_t dy = y0; dy < ylimit; dy++) {
2855     #if JUMPER_NARROW_STAGES
2856         Params params = { x0,dy,0, 0,0,0,0 };
2857         for (; params.dx + N <= xlimit; params.dx += N) {
2858             start(&params,program, 0,0,0,0);
2859         }
2860         if (size_t tail = xlimit - params.dx) {
2861             params.tail = tail;
2862             start(&params,program, 0,0,0,0);
2863         }
2864     #else
2865         size_t dx = x0;
2866         for (; dx + N <= xlimit; dx += N) {
2867             start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
2868         }
2869         if (size_t tail = xlimit - dx) {
2870             start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
2871         }
2872     #endif
2873     }
2874 }
2875 
2876 #if JUMPER_NARROW_STAGES
2877     static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
2878 #else
2879     static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
2880 #endif
2881 
2882 // All stages use the same function call ABI to chain into each other, but there are three types:
2883 //   GG: geometry in, geometry out  -- think, a matrix
2884 //   GP: geometry in, pixels out.   -- think, a memory gather
2885 //   PP: pixels in, pixels out.     -- think, a blend mode
2886 //
2887 // (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
2888 //
2889 // These three STAGE_ macros let you define each type of stage,
2890 // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
2891 
2892 #if JUMPER_NARROW_STAGES
2893     #define STAGE_GG(name, ...)                                                                \
2894         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);          \
2895         static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) {     \
2896             auto x = join<F>(r,g),                                                             \
2897                  y = join<F>(b,a);                                                             \
2898             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y);                   \
2899             split(x, &r,&g);                                                                   \
2900             split(y, &b,&a);                                                                   \
2901             auto next = (Stage)load_and_inc(program);                                          \
2902             next(params,program, r,g,b,a);                                                     \
2903         }                                                                                      \
2904         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
2905 
2906     #define STAGE_GP(name, ...)                                                            \
2907         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
2908                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2909                          U16& dr, U16& dg, U16& db, U16& da);                              \
2910         static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
2911             auto x = join<F>(r,g),                                                         \
2912                  y = join<F>(b,a);                                                         \
2913             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a,       \
2914                      params->dr,params->dg,params->db,params->da);                         \
2915             auto next = (Stage)load_and_inc(program);                                      \
2916             next(params,program, r,g,b,a);                                                 \
2917         }                                                                                  \
2918         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
2919                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2920                          U16& dr, U16& dg, U16& db, U16& da)
2921 
2922     #define STAGE_PP(name, ...)                                                            \
2923         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
2924                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2925                          U16& dr, U16& dg, U16& db, U16& da);                              \
2926         static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
2927             name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a,            \
2928                      params->dr,params->dg,params->db,params->da);                         \
2929             auto next = (Stage)load_and_inc(program);                                      \
2930             next(params,program, r,g,b,a);                                                 \
2931         }                                                                                  \
2932         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
2933                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2934                          U16& dr, U16& dg, U16& db, U16& da)
2935 #else
2936     #define STAGE_GG(name, ...)                                                            \
2937         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
2938         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
2939                              U16  r, U16  g, U16  b, U16  a,                               \
2940                              U16 dr, U16 dg, U16 db, U16 da) {                             \
2941             auto x = join<F>(r,g),                                                         \
2942                  y = join<F>(b,a);                                                         \
2943             name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
2944             split(x, &r,&g);                                                               \
2945             split(y, &b,&a);                                                               \
2946             auto next = (Stage)load_and_inc(program);                                      \
2947             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
2948         }                                                                                  \
2949         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
2950 
2951     #define STAGE_GP(name, ...)                                                            \
2952         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
2953                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2954                          U16& dr, U16& dg, U16& db, U16& da);                              \
2955         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
2956                              U16  r, U16  g, U16  b, U16  a,                               \
2957                              U16 dr, U16 dg, U16 db, U16 da) {                             \
2958             auto x = join<F>(r,g),                                                         \
2959                  y = join<F>(b,a);                                                         \
2960             name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
2961             auto next = (Stage)load_and_inc(program);                                      \
2962             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
2963         }                                                                                  \
2964         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
2965                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2966                          U16& dr, U16& dg, U16& db, U16& da)
2967 
2968     #define STAGE_PP(name, ...)                                                            \
2969         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
2970                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2971                          U16& dr, U16& dg, U16& db, U16& da);                              \
2972         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
2973                              U16  r, U16  g, U16  b, U16  a,                               \
2974                              U16 dr, U16 dg, U16 db, U16 da) {                             \
2975             name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
2976             auto next = (Stage)load_and_inc(program);                                      \
2977             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
2978         }                                                                                  \
2979         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
2980                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
2981                          U16& dr, U16& dg, U16& db, U16& da)
2982 #endif
2983 
2984 // ~~~~~~ Commonly used helper functions ~~~~~~ //
2985 
2986 SI U16 div255(U16 v) {
2987 #if 0
2988     return (v+127)/255;  // The ideal rounding divide by 255.
2989 #elif 1 && defined(JUMPER_IS_NEON)
2990     // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
2991     // just as fast as we can do the approximation below, so might as well be correct!
2992     // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
2993     return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
2994 #else
2995     return (v+255)/256;  // A good approximation of (v+127)/255.
2996 #endif
2997 }
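// Sanity check: for v = 255*255 = 65025 both forms give 255, and they can
// differ by at most one elsewhere (e.g. v = 127 gives 1 here vs. 0 exactly).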
2998 
2999 SI U16 inv(U16 v) { return 255-v; }
3000 
3001 SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
3002 SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
3003 
3004 SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
3005 SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
3006 SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
3007 SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }
3008 
3009 SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
3010 
3011 SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
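// e.g. lerp(0, 255, 128) = div255(0*127 + 255*128) = div255(32640) = 128,
// i.e. a ~50% blend between from=0 and to=255 lands on 128 as expected.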
3012 
3013 template <typename D, typename S>
3014 SI D cast(S src) {
3015     return __builtin_convertvector(src, D);
3016 }
3017 
3018 template <typename D, typename S>
3019 SI void split(S v, D* lo, D* hi) {
3020     static_assert(2*sizeof(D) == sizeof(S), "");
3021     memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
3022     memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
3023 }
3024 template <typename D, typename S>
3025 SI D join(S lo, S hi) {
3026     static_assert(sizeof(D) == 2*sizeof(S), "");
3027     D v;
3028     memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
3029     memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
3030     return v;
3031 }
3032 
3033 SI F if_then_else(I32 c, F t, F e) {
3034     return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
3035 }
3036 SI F max(F x, F y) { return if_then_else(x < y, y, x); }
3037 SI F min(F x, F y) { return if_then_else(x < y, x, y); }
3038 
3039 SI F mad(F f, F m, F a) { return f*m+a; }
3040 SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
3041 
3042 SI F rcp(F x) {
3043 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3044     __m256 lo,hi;
3045     split(x, &lo,&hi);
3046     return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi));
3047 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3048     __m128 lo,hi;
3049     split(x, &lo,&hi);
3050     return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi));
3051 #elif defined(JUMPER_IS_NEON)
3052     auto rcp = [](float32x4_t v) {
3053         auto est = vrecpeq_f32(v);
3054         return vrecpsq_f32(v,est)*est;
3055     };
3056     float32x4_t lo,hi;
3057     split(x, &lo,&hi);
3058     return join<F>(rcp(lo), rcp(hi));
3059 #else
3060     return 1.0f / x;
3061 #endif
3062 }
3063 SI F sqrt_(F x) {
3064 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3065     __m256 lo,hi;
3066     split(x, &lo,&hi);
3067     return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
3068 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3069     __m128 lo,hi;
3070     split(x, &lo,&hi);
3071     return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
3072 #elif defined(SK_CPU_ARM64)
3073     float32x4_t lo,hi;
3074     split(x, &lo,&hi);
3075     return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
3076 #elif defined(JUMPER_IS_NEON)
3077     auto sqrt = [](float32x4_t v) {
3078         auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
3079         est *= vrsqrtsq_f32(v,est*est);
3080         est *= vrsqrtsq_f32(v,est*est);
3081         return v*est;                // sqrt(v) == v*rsqrt(v).
3082     };
3083     float32x4_t lo,hi;
3084     split(x, &lo,&hi);
3085     return join<F>(sqrt(lo), sqrt(hi));
3086 #else
3087     return F{
3088         sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
3089         sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
3090     };
3091 #endif
3092 }
3093 
3094 SI F floor_(F x) {
3095 #if defined(SK_CPU_ARM64)
3096     float32x4_t lo,hi;
3097     split(x, &lo,&hi);
3098     return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
3099 #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3100     __m256 lo,hi;
3101     split(x, &lo,&hi);
3102     return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
3103 #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3104     __m128 lo,hi;
3105     split(x, &lo,&hi);
3106     return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
3107 #else
3108     F roundtrip = cast<F>(cast<I32>(x));
3109     return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
3110 #endif
3111 }
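// Fallback branch sanity check: x = -1.25 truncates to -1; since -1 > -1.25
// we subtract 1 and get floor(-1.25) = -2.  For x = 1.25 the truncated value
// 1 is not greater than x, so it is returned unchanged.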
3112 SI F fract(F x) { return x - floor_(x); }
3113 SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
3114 
3115 // ~~~~~~ Basic / misc. stages ~~~~~~ //
3116 
3117 STAGE_GG(seed_shader, Ctx::None) {
3118     static const float iota[] = {
3119         0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3120         8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3121     };
3122     x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
3123     y = cast<F>(I32(dy)) + 0.5f;
3124 }
3125 
3126 STAGE_GG(matrix_translate, const float* m) {
3127     x += m[0];
3128     y += m[1];
3129 }
3130 STAGE_GG(matrix_scale_translate, const float* m) {
3131     x = mad(x,m[0], m[2]);
3132     y = mad(y,m[1], m[3]);
3133 }
3134 STAGE_GG(matrix_2x3, const float* m) {
3135     auto X = mad(x,m[0], mad(y,m[2], m[4])),
3136          Y = mad(x,m[1], mad(y,m[3], m[5]));
3137     x = X;
3138     y = Y;
3139 }
3140 STAGE_GG(matrix_perspective, const float* m) {
3141     // N.B. Unlike the other matrix_ stages, this matrix is row-major.
3142     auto X = mad(x,m[0], mad(y,m[1], m[2])),
3143          Y = mad(x,m[3], mad(y,m[4], m[5])),
3144          Z = mad(x,m[6], mad(y,m[7], m[8]));
3145     x = X * rcp(Z);
3146     y = Y * rcp(Z);
3147 }
3148 
3149 STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
3150     r = c->rgba[0];
3151     g = c->rgba[1];
3152     b = c->rgba[2];
3153     a = c->rgba[3];
3154 }
3155 STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
3156     dr = c->rgba[0];
3157     dg = c->rgba[1];
3158     db = c->rgba[2];
3159     da = c->rgba[3];
3160 }
3161 STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
3162 STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
3163 
3164 STAGE_PP(set_rgb, const float rgb[3]) {
3165     r = from_float(rgb[0]);
3166     g = from_float(rgb[1]);
3167     b = from_float(rgb[2]);
3168 }
3169 
3170 STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ }
3171 STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ }
3172 
3173 STAGE_PP(clamp_a, Ctx::None) {
3174     r = min(r, a);
3175     g = min(g, a);
3176     b = min(b, a);
3177 }
3178 
3179 STAGE_PP(clamp_gamut, Ctx::None) {
3180     // It shouldn't be possible to get out-of-gamut
3181     // colors when working in lowp.
3182 }
3183 
3184 STAGE_PP(premul, Ctx::None) {
3185     r = div255(r * a);
3186     g = div255(g * a);
3187     b = div255(b * a);
3188 }
3189 STAGE_PP(premul_dst, Ctx::None) {
3190     dr = div255(dr * da);
3191     dg = div255(dg * da);
3192     db = div255(db * da);
3193 }
3194 
3195 STAGE_PP(force_opaque    , Ctx::None) {  a = 255; }
3196 STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
3197 
3198 STAGE_PP(swap_rb, Ctx::None) {
3199     auto tmp = r;
3200     r = b;
3201     b = tmp;
3202 }
3203 STAGE_PP(swap_rb_dst, Ctx::None) {
3204     auto tmp = dr;
3205     dr = db;
3206     db = tmp;
3207 }
3208 
3209 STAGE_PP(move_src_dst, Ctx::None) {
3210     dr = r;
3211     dg = g;
3212     db = b;
3213     da = a;
3214 }
3215 
3216 STAGE_PP(move_dst_src, Ctx::None) {
3217     r = dr;
3218     g = dg;
3219     b = db;
3220     a = da;
3221 }
3222 
3223 // ~~~~~~ Blend modes ~~~~~~ //
3224 
3225 // The same logic applied to all 4 channels.
3226 #define BLEND_MODE(name)                                 \
3227     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3228     STAGE_PP(name, Ctx::None) {                          \
3229         r = name##_channel(r,dr,a,da);                   \
3230         g = name##_channel(g,dg,a,da);                   \
3231         b = name##_channel(b,db,a,da);                   \
3232         a = name##_channel(a,da,a,da);                   \
3233     }                                                    \
3234     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3235 
3236     BLEND_MODE(clear)    { return 0; }
3237     BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
3238     BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
3239     BLEND_MODE(srcin)    { return div255( s*da ); }
3240     BLEND_MODE(dstin)    { return div255( d*sa ); }
3241     BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
3242     BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
3243     BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
3244     BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
3245     BLEND_MODE(modulate) { return div255( s*d ); }
3246     BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
3247     BLEND_MODE(plus_)    { return min(s+d, 255); }
3248     BLEND_MODE(screen)   { return s + d - div255( s*d ); }
3249     BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
3250 #undef BLEND_MODE
3251 
3252 // The same logic applied to color, and srcover for alpha.
3253 #define BLEND_MODE(name)                                 \
3254     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3255     STAGE_PP(name, Ctx::None) {                          \
3256         r = name##_channel(r,dr,a,da);                   \
3257         g = name##_channel(g,dg,a,da);                   \
3258         b = name##_channel(b,db,a,da);                   \
3259         a = a + div255( da*inv(a) );                     \
3260     }                                                    \
3261     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3262 
3263     BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
3264     BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
3265     BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
3266     BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }
3267 
3268     BLEND_MODE(hardlight) {
3269         return div255( s*inv(da) + d*inv(sa) +
3270                        if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3271     }
3272     BLEND_MODE(overlay) {
3273         return div255( s*inv(da) + d*inv(sa) +
3274                        if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3275     }
3276 #undef BLEND_MODE
3277 
3278 // ~~~~~~ Helpers for interacting with memory ~~~~~~ //
3279 
3280 template <typename T>
3281 SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
3282     return (T*)ctx->pixels + dy*ctx->stride + dx;
3283 }
3284 
3285 template <typename T>
3286 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
3287     auto clamp = [](F v, F limit) {
3288         limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
3289         return min(max(0, v), limit);
3290     };
3291     x = clamp(x, ctx->width);
3292     y = clamp(y, ctx->height);
3293 
3294     *ptr = (const T*)ctx->pixels;
3295     return trunc_(y)*ctx->stride + trunc_(x);
3296 }
3297 
3298 template <typename V, typename T>
3299 SI V load(const T* ptr, size_t tail) {
3300     V v = 0;
3301     switch (tail & (N-1)) {
3302         case  0: memcpy(&v, ptr, sizeof(v)); break;
3303     #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3304         case 15: v[14] = ptr[14];
3305         case 14: v[13] = ptr[13];
3306         case 13: v[12] = ptr[12];
3307         case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
3308         case 11: v[10] = ptr[10];
3309         case 10: v[ 9] = ptr[ 9];
3310         case  9: v[ 8] = ptr[ 8];
3311         case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
3312     #endif
3313         case  7: v[ 6] = ptr[ 6];
3314         case  6: v[ 5] = ptr[ 5];
3315         case  5: v[ 4] = ptr[ 4];
3316         case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
3317         case  3: v[ 2] = ptr[ 2];
3318         case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
3319         case  1: v[ 0] = ptr[ 0];
3320     }
3321     return v;
3322 }
3323 template <typename V, typename T>
3324 SI void store(T* ptr, size_t tail, V v) {
3325     switch (tail & (N-1)) {
3326         case  0: memcpy(ptr, &v, sizeof(v)); break;
3327     #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3328         case 15: ptr[14] = v[14];
3329         case 14: ptr[13] = v[13];
3330         case 13: ptr[12] = v[12];
3331         case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
3332         case 11: ptr[10] = v[10];
3333         case 10: ptr[ 9] = v[ 9];
3334         case  9: ptr[ 8] = v[ 8];
3335         case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
3336     #endif
3337         case  7: ptr[ 6] = v[ 6];
3338         case  6: ptr[ 5] = v[ 5];
3339         case  5: ptr[ 4] = v[ 4];
3340         case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
3341         case  3: ptr[ 2] = v[ 2];
3342         case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
3343         case  1: ptr[ 0] = v[ 0];
3344     }
3345 }
3346 
3347 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3348     template <typename V, typename T>
3349     SI V gather(const T* ptr, U32 ix) {
3350         return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3351                   ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
3352                   ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
3353                   ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
3354     }
3355 
3356     template<>
3357     F gather(const float* ptr, U32 ix) {
3358         __m256i lo, hi;
3359         split(ix, &lo, &hi);
3360 
3361         return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
3362                        _mm256_i32gather_ps(ptr, hi, 4));
3363     }
3364 
3365     template<>
3366     U32 gather(const uint32_t* ptr, U32 ix) {
3367         __m256i lo, hi;
3368         split(ix, &lo, &hi);
3369 
3370         return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4),
3371                          _mm256_i32gather_epi32(ptr, hi, 4));
3372     }
3373 #else
3374     template <typename V, typename T>
3375     SI V gather(const T* ptr, U32 ix) {
3376         return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3377                   ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
3378     }
3379 #endif
3380 
3381 
3382 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
3383 
3384 SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
3385 #if 1 && defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3386     // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
3387     __m256i _01,_23;
3388     split(rgba, &_01, &_23);
3389     __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
3390             _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
3391     rgba = join<U32>(_02, _13);
3392 
3393     auto cast_U16 = [](U32 v) -> U16 {
3394         __m256i _02,_13;
3395         split(v, &_02,&_13);
3396         return _mm256_packus_epi32(_02,_13);
3397     };
3398 #else
3399     auto cast_U16 = [](U32 v) -> U16 {
3400         return cast<U16>(v);
3401     };
3402 #endif
3403     *r = cast_U16(rgba & 65535) & 255;
3404     *g = cast_U16(rgba & 65535) >>  8;
3405     *b = cast_U16(rgba >>   16) & 255;
3406     *a = cast_U16(rgba >>   16) >>  8;
3407 }
3408 
3409 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3410 #if 1 && defined(JUMPER_IS_NEON)
3411     uint8x8x4_t rgba;
3412     switch (tail & (N-1)) {
3413         case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
3414         case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
3415         case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
3416         case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
3417         case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
3418         case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
3419         case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
3420         case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
3421     }
3422     *r = cast<U16>(rgba.val[0]);
3423     *g = cast<U16>(rgba.val[1]);
3424     *b = cast<U16>(rgba.val[2]);
3425     *a = cast<U16>(rgba.val[3]);
3426 #else
3427     from_8888(load<U32>(ptr, tail), r,g,b,a);
3428 #endif
3429 }
3430 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3431 #if 1 && defined(JUMPER_IS_NEON)
3432     uint8x8x4_t rgba = {{
3433         cast<U8>(r),
3434         cast<U8>(g),
3435         cast<U8>(b),
3436         cast<U8>(a),
3437     }};
3438     switch (tail & (N-1)) {
3439         case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
3440         case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
3441         case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
3442         case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
3443         case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
3444         case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
3445         case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
3446         case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
3447     }
3448 #else
3449     store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
3450                    | cast<U32>(b | (a<<8)) << 16);
3451 #endif
3452 }
3453 
3454 STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3455     load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3456 }
3457 STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3458     load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3459 }
3460 STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3461     store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
3462 }
3463 STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
3464     const uint32_t* ptr;
3465     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3466     from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
3467 }
3468 
3469 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
3470 
3471 SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
3472     // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
3473     U16 R = (rgb >> 11) & 31,
3474         G = (rgb >>  5) & 63,
3475         B = (rgb >>  0) & 31;
3476 
3477     // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
3478     *r = (R << 3) | (R >> 2);
3479     *g = (G << 2) | (G >> 4);
3480     *b = (B << 3) | (B >> 2);
3481 }
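// e.g. R = 31 expands to (31<<3)|(31>>2) = 248|7 = 255 and R = 0 stays 0,
// so the 5-bit endpoints land exactly on the 8-bit endpoints.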
3482 SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3483     from_565(load<U16>(ptr, tail), r,g,b);
3484 }
3485 SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3486     // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
3487     // (Don't feel like you need to find some fundamental truth in these...
3488     // they were brute-force searched.)
3489     U16 R = (r *  9 + 36) / 74,   //  9/74 ≈ 31/255, plus 36/74, about half.
3490         G = (g * 21 + 42) / 85,   // 21/85 = 63/255 exactly.
3491         B = (b *  9 + 36) / 74;
3492     // Pack them back into 15|rrrrr gggggg bbbbb|0.
3493     store(ptr, tail, R << 11
3494                    | G <<  5
3495                    | B <<  0);
3496 }
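// Spot check of the rounding above: r = 255 -> (255*9+36)/74 = 2331/74 = 31,
// and r = 128 -> (128*9+36)/74 = 1188/74 = 16, matching 128*(31/255.0f)+0.5f.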
3497 
3498 STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
3499     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
3500     a = 255;
3501 }
3502 STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3503     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
3504     da = 255;
3505 }
3506 STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
3507     store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
3508 }
3509 STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
3510     const uint16_t* ptr;
3511     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3512     from_565(gather<U16>(ptr, ix), &r, &g, &b);
3513     a = 255;
3514 }
3515 
3516 SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
3517     // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
3518     U16 R = (rgba >> 12) & 15,
3519         G = (rgba >>  8) & 15,
3520         B = (rgba >>  4) & 15,
3521         A = (rgba >>  0) & 15;
3522 
3523     // Scale [0,15] to [0,255].
3524     *r = (R << 4) | R;
3525     *g = (G << 4) | G;
3526     *b = (B << 4) | B;
3527     *a = (A << 4) | A;
3528 }
3529 SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3530     from_4444(load<U16>(ptr, tail), r,g,b,a);
3531 }
3532 SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3533     // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
3534     U16 R = (r + 8) / 17,
3535         G = (g + 8) / 17,
3536         B = (b + 8) / 17,
3537         A = (a + 8) / 17;
3538     // Pack them back into 15|rrrr gggg bbbb aaaa|0.
3539     store(ptr, tail, R << 12
3540                    | G <<  8
3541                    | B <<  4
3542                    | A <<  0);
3543 }
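// e.g. a 4-bit value of 15 expands above to (15<<4)|15 = 255, and 255 packs
// back down as (255+8)/17 = 15, so the endpoints round-trip exactly.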
3544 
3545 STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3546     load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3547 }
3548 STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3549     load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3550 }
3551 STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3552     store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
3553 }
3554 STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
3555     const uint16_t* ptr;
3556     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3557     from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
3558 }
3559 
3560 SI void from_88(U16 rg, U16* r, U16* g) {
3561     *r = (rg & 0xFF);
3562     *g = (rg >> 8);
3563 }
3564 
3565 SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3566 #if 1 && defined(JUMPER_IS_NEON)
3567     uint8x8x2_t rg;
3568     switch (tail & (N-1)) {
3569         case 0: rg = vld2_u8     ((const uint8_t*)(ptr+0)         ); break;
3570         case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6);
3571         case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5);
3572         case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4);
3573         case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3);
3574         case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2);
3575         case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1);
3576         case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0);
3577     }
3578     *r = cast<U16>(rg.val[0]);
3579     *g = cast<U16>(rg.val[1]);
3580 #else
3581     from_88(load<U16>(ptr, tail), r,g);
3582 #endif
3583 }
3584 
3585 SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3586 #if 1 && defined(JUMPER_IS_NEON)
3587     uint8x8x2_t rg = {{
3588         cast<U8>(r),
3589         cast<U8>(g),
3590     }};
3591     switch (tail & (N-1)) {
3592         case 0: vst2_u8     ((uint8_t*)(ptr+0), rg   ); break;
3593         case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6);
3594         case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5);
3595         case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4);
3596         case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3);
3597         case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2);
3598         case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1);
3599         case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0);
3600     }
3601 #else
3602     store(ptr, tail, cast<U16>(r | (g<<8)) <<  0);
3603 #endif
3604 }
3605 
3606 STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3607     b = 0;
3608     a = 255;
3609     load_88_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g);
3610 }
3611 STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3612     store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
3613 }
3614 
3615 // ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
3616 
3617 SI U16 load_8(const uint8_t* ptr, size_t tail) {
3618     return cast<U16>(load<U8>(ptr, tail));
3619 }
3620 SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3621     store(ptr, tail, cast<U8>(v));
3622 }
3623 
3624 STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3625     r = g = b = 0;
3626     a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3627 }
3628 STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3629     dr = dg = db = 0;
3630     da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3631 }
3632 STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3633     store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
3634 }
3635 STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
3636     const uint8_t* ptr;
3637     U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3638     r = g = b = 0;
3639     a = cast<U16>(gather<U8>(ptr, ix));
3640 }
3641 
3642 STAGE_PP(alpha_to_gray, Ctx::None) {
3643     r = g = b = a;
3644     a = 255;
3645 }
3646 STAGE_PP(alpha_to_gray_dst, Ctx::None) {
3647     dr = dg = db = da;
3648     da = 255;
3649 }
3650 STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) {
3651     a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
3652     r = g = b = 0;
3653 }
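// Note the weights sum to 54 + 183 + 19 = 256, so a fully white pixel maps
// to alpha = (255*256)/256 = 255 exactly.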
3654 
3655 // ~~~~~~ Saving and restoring the src/dst registers ~~~~~~ //
3656 
3657 STAGE_PP(load_src, const uint16_t* ptr) {
3658     r = sk_unaligned_load<U16>(ptr + 0*N);
3659     g = sk_unaligned_load<U16>(ptr + 1*N);
3660     b = sk_unaligned_load<U16>(ptr + 2*N);
3661     a = sk_unaligned_load<U16>(ptr + 3*N);
3662 }
3663 STAGE_PP(store_src, uint16_t* ptr) {
3664     sk_unaligned_store(ptr + 0*N, r);
3665     sk_unaligned_store(ptr + 1*N, g);
3666     sk_unaligned_store(ptr + 2*N, b);
3667     sk_unaligned_store(ptr + 3*N, a);
3668 }
3669 STAGE_PP(load_dst, const uint16_t* ptr) {
3670     dr = sk_unaligned_load<U16>(ptr + 0*N);
3671     dg = sk_unaligned_load<U16>(ptr + 1*N);
3672     db = sk_unaligned_load<U16>(ptr + 2*N);
3673     da = sk_unaligned_load<U16>(ptr + 3*N);
3674 }
3675 STAGE_PP(store_dst, uint16_t* ptr) {
3676     sk_unaligned_store(ptr + 0*N, dr);
3677     sk_unaligned_store(ptr + 1*N, dg);
3678     sk_unaligned_store(ptr + 2*N, db);
3679     sk_unaligned_store(ptr + 3*N, da);
3680 }
3681 
3682 // ~~~~~~ Coverage scales / lerps ~~~~~~ //
3683 
3684 STAGE_PP(scale_1_float, const float* f) {
3685     U16 c = from_float(*f);
3686     r = div255( r * c );
3687     g = div255( g * c );
3688     b = div255( b * c );
3689     a = div255( a * c );
3690 }
3691 STAGE_PP(lerp_1_float, const float* f) {
3692     U16 c = from_float(*f);
3693     r = lerp(dr, r, c);
3694     g = lerp(dg, g, c);
3695     b = lerp(db, b, c);
3696     a = lerp(da, a, c);
3697 }
3698 STAGE_PP(lerp_native, const uint16_t scales[]) {
3699     auto c = sk_unaligned_load<U16>(scales);
3700     r = lerp(dr, r, c);
3701     g = lerp(dg, g, c);
3702     b = lerp(db, b, c);
3703     a = lerp(da, a, c);
3704 }
3705 
3706 STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3707     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3708     r = div255( r * c );
3709     g = div255( g * c );
3710     b = div255( b * c );
3711     a = div255( a * c );
3712 }
3713 STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3714     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3715     r = lerp(dr, r, c);
3716     g = lerp(dg, g, c);
3717     b = lerp(db, b, c);
3718     a = lerp(da, a, c);
3719 }
3720 
3721 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
3722 SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
3723     return if_then_else(a < da, min(cr,cg,cb)
3724                               , max(cr,cg,cb));
3725 }
3726 STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
3727     U16 cr,cg,cb;
3728     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3729     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3730 
3731     r = div255( r * cr );
3732     g = div255( g * cg );
3733     b = div255( b * cb );
3734     a = div255( a * ca );
3735 }
3736 STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
3737     U16 cr,cg,cb;
3738     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3739     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3740 
3741     r = lerp(dr, r, cr);
3742     g = lerp(dg, g, cg);
3743     b = lerp(db, b, cb);
3744     a = lerp(da, a, ca);
3745 }
3746 
3747 STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
3748     U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
3749         add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
3750 
3751     r = min(div255(r*mul) + add, a);
3752     g = min(div255(g*mul) + add, a);
3753     b = min(div255(b*mul) + add, a);
3754 }
3755 
3756 
3757 // ~~~~~~ Gradient stages ~~~~~~ //
3758 
3759 // Clamp x to [0,1], both sides inclusive (think, gradients).
3760 // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
3761 SI F clamp_01(F v) { return min(max(0, v), 1); }
3762 
3763 STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
3764 STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
3765 STAGE_GG(mirror_x_1, Ctx::None) {
3766     auto two = [](F x){ return x+x; };
3767     x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
3768 }
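// e.g. x = 1.5: (x-1) = 0.5, two(floor(0.25)) = 0, so abs(0.5 - 0 - 1) = 0.5;
// the coordinate reflects back into [0,1] as expected.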
3769 
3770 SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
3771 
3772 STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
3773     auto w = ctx->limit_x;
3774     sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
3775 }
3776 STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
3777     auto h = ctx->limit_y;
3778     sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
3779 }
3780 STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
3781     auto w = ctx->limit_x;
3782     auto h = ctx->limit_y;
3783     sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
3784 }
3785 STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
3786     auto mask = sk_unaligned_load<U16>(ctx->mask);
3787     r = r & mask;
3788     g = g & mask;
3789     b = b & mask;
3790     a = a & mask;
3791 }
3792 
3793 SI void round_F_to_U16(F    R, F    G, F    B, F    A, bool interpolatedInPremul,
3794                        U16* r, U16* g, U16* b, U16* a) {
3795     auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
3796 
3797     F limit = interpolatedInPremul ? A
3798                                    : 1;
3799     *r = round(min(max(0,R), limit));
3800     *g = round(min(max(0,G), limit));
3801     *b = round(min(max(0,B), limit));
3802     *a = round(A);  // we assume alpha is already in [0,1].
3803 }
3804 
3805 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3806                         U16* r, U16* g, U16* b, U16* a) {
3807 
3808     F fr, fg, fb, fa, br, bg, bb, ba;
3809 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3810     if (c->stopCount <=8) {
3811         __m256i lo, hi;
3812         split(idx, &lo, &hi);
3813 
3814         fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
3815                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
3816         br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
3817                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
3818         fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
3819                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
3820         bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
3821                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
3822         fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
3823                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
3824         bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
3825                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
3826         fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
3827                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
3828         ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
3829                      _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
3830     } else
3831 #endif
3832     {
3833         fr = gather<F>(c->fs[0], idx);
3834         fg = gather<F>(c->fs[1], idx);
3835         fb = gather<F>(c->fs[2], idx);
3836         fa = gather<F>(c->fs[3], idx);
3837         br = gather<F>(c->bs[0], idx);
3838         bg = gather<F>(c->bs[1], idx);
3839         bb = gather<F>(c->bs[2], idx);
3840         ba = gather<F>(c->bs[3], idx);
3841     }
3842     round_F_to_U16(mad(t, fr, br),
3843                    mad(t, fg, bg),
3844                    mad(t, fb, bb),
3845                    mad(t, fa, ba),
3846                    c->interpolatedInPremul,
3847                    r,g,b,a);
3848 }
3849 
3850 STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
3851     auto t = x;
3852     U32 idx = 0;
3853 
3854     // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
3855     for (size_t i = 1; i < c->stopCount; i++) {
3856         idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
3857     }
3858 
3859     gradient_lookup(c, idx, t, &r, &g, &b, &a);
3860 }
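// e.g. with illustrative stops ts = {0, 0.25, 0.75, 1} and t = 0.5, only
// ts[1] = 0.25 satisfies t >= ts[i] for i = 1..3, so idx ends up at 1.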
3861 
3862 STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
3863     auto t = x;
3864     auto idx = trunc_(t * (c->stopCount-1));
3865     gradient_lookup(c, idx, t, &r, &g, &b, &a);
3866 }
3867 
3868 STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
3869     auto t = x;
3870     round_F_to_U16(mad(t, c->f[0], c->b[0]),
3871                    mad(t, c->f[1], c->b[1]),
3872                    mad(t, c->f[2], c->b[2]),
3873                    mad(t, c->f[3], c->b[3]),
3874                    c->interpolatedInPremul,
3875                    &r,&g,&b,&a);
3876 }
3877 
3878 STAGE_GG(xy_to_unit_angle, Ctx::None) {
3879     F xabs = abs_(x),
3880       yabs = abs_(y);
3881 
3882     F slope = min(xabs, yabs)/max(xabs, yabs);
3883     F s = slope * slope;
3884 
3885     // Use a 7th degree polynomial to approximate atan.
3886     // This was generated using sollya.gforge.inria.fr.
3887     // A float optimized polynomial was generated using the following command.
3888     // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
3889     F phi = slope
3890              * (0.15912117063999176025390625f     + s
3891              * (-5.185396969318389892578125e-2f   + s
3892              * (2.476101927459239959716796875e-2f + s
3893              * (-7.0547382347285747528076171875e-3f))));
3894 
3895     phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
3896     phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
3897     phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
3898     phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
3899     x = phi;
3900 }
3901 STAGE_GG(xy_to_radius, Ctx::None) {
3902     x = sqrt_(x*x + y*y);
3903 }
3904 
3905 // ~~~~~~ Compound stages ~~~~~~ //
3906 
3907 STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3908     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3909 
3910     load_8888_(ptr, tail, &dr,&dg,&db,&da);
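    // Added note: this is src-over, s + d*(1-a), in 8-bit fixed point; here in lowp
    // inv(a) yields 255 - a and div255() rescales each 16-bit product back into 0..255.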
3911     r = r + div255( dr*inv(a) );
3912     g = g + div255( dg*inv(a) );
3913     b = b + div255( db*inv(a) );
3914     a = a + div255( da*inv(a) );
3915     store_8888_(ptr, tail, r,g,b,a);
3916 }
3917 
3918 #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE)
3919     static void(*bilerp_clamp_8888)(void) = nullptr;
3920     static void(*bilinear)(void) = nullptr;
3921 #else
3922 STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
3923     // (cx,cy) are the center of our sample.
3924     F cx = x,
3925       cy = y;
3926 
3927     // All sample points are at the same fractional offset (fx,fy).
3928     // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
3929     F fx = fract(cx + 0.5f),
3930       fy = fract(cy + 0.5f);
3931 
3932     // We'll accumulate the color of all four samples into {r,g,b,a} directly.
3933     r = g = b = a = 0;
3934 
3935     // The first three sample points will calculate their area using math
3936     // just like in the float code above, but the fourth will take up all the rest.
3937     //
3938     // Logically this is the same as doing the math for the fourth pixel too,
3939     // but this avoids accumulated rounding error, e.g. it keeps fully opaque pixels opaque.
3940     //
3941     // We can keep up to 8 bits of fractional precision without overflowing 16-bit,
3942     // so our "1.0" area is 256.
3943     const uint16_t bias = 256;
3944     U16 remaining = bias;
3945 
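    // Worked example (added for illustration): with fx == 0.25 and fy == 0.75 the first
    // three areas are 0.75*0.25*256 == 48, 0.25*0.25*256 == 16, and 0.75*0.75*256 == 144,
    // leaving remaining == 48 for the last corner (exactly 0.25*0.75*256).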
3946     for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
3947     for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
3948         // (x,y) are the coordinates of this sample point.
3949         F x = cx + dx,
3950           y = cy + dy;
3951 
3952         // ix_and_ptr() will clamp to the image's bounds for us.
3953         const uint32_t* ptr;
3954         U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3955 
3956         U16 sr,sg,sb,sa;
3957         from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa);
3958 
3959         // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3960         // are combined in direct proportion to their area overlapping that logical query pixel.
3961         // At positive offsets, the x-axis contribution to that rectangle is fx,
3962         // or (1-fx) at negative x.  Same deal for y.
3963         F sx = (dx > 0) ? fx : 1.0f - fx,
3964           sy = (dy > 0) ? fy : 1.0f - fy;
3965 
3966         U16 area = (dy == 0.5f && dx == 0.5f) ? remaining
3967                                               : cast<U16>(sx * sy * bias);
3968         for (size_t i = 0; i < N; i++) {
3969             SkASSERT(remaining[i] >= area[i]);
3970         }
3971         remaining -= area;
3972 
3973         r += sr * area;
3974         g += sg * area;
3975         b += sb * area;
3976         a += sa * area;
3977     }
3978 
3979     r = (r + bias/2) / bias;
3980     g = (g + bias/2) / bias;
3981     b = (b + bias/2) / bias;
3982     a = (a + bias/2) / bias;
3983 }
3984 
3985 // TODO: lowp::tile() is identical to the highp tile()... share?
3986 SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
3987     // ix_and_ptr() will clamp the output of tile() afterwards, so we need not clamp here.
3988     switch (mode) {
3989         case SkTileMode::kDecal:  // TODO, for now fallthrough to clamp
3990         case SkTileMode::kClamp:  return v;
3991         case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
3992         case SkTileMode::kMirror:
3993             return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
3994     }
3995     SkUNREACHABLE;
3996 }
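// Worked example for the formulas above (added, illustrative only): with limit == 4 and
// v == 5, kRepeat gives 5 - floor(5*0.25)*4 == 1, while kMirror gives
// abs((5-4) - 8*floor((5-4)*0.125) - 4) == abs(1 - 0 - 4) == 3, reflecting off the edge.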
3997 
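// Added note: sample() fetches one texel per lane: it tiles the coordinates, then gathers
// and unpacks a pixel.  Only the 8888 color types are handled so far, with BGRA read as
// RGBA and then r/b swapped; everything else returns transparent black (see the TODO below).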
3998 SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
3999                U16* r, U16* g, U16* b, U16* a) {
4000     x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
4001     y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
4002 
4003     switch (ctx->ct) {
4004         default: *r = *g = *b = *a = 0;  // TODO
4005                  break;
4006 
4007         case kRGBA_8888_SkColorType:
4008         case kBGRA_8888_SkColorType: {
4009             const uint32_t* ptr;
4010             U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4011             from_8888(gather<U32>(ptr, ix), r,g,b,a);
4012             if (ctx->ct == kBGRA_8888_SkColorType) {
4013                 std::swap(*r,*b);
4014             }
4015         } break;
4016     }
4017 }
4018 
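// Added commentary on the template below: sampler() applies a D x D separable filter
// around (cx,cy), weighting each tap by wx[i]*wy[j] in the same 8-bit fixed-point scheme
// as bilerp_clamp_8888 above (bias == 256, last tap takes the remainder).  D == 2 with
// tent weights {1-f, f} gives the bilinear stage further down.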
4019 template <int D>
4020 SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
4021                 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
4022                 U16* r, U16* g, U16* b, U16* a) {
4023 
4024     float start = -0.5f*(D-1);
4025 
4026     const uint16_t bias = 256;
4027     U16 remaining = bias;
4028 
4029     *r = *g = *b = *a = 0;
4030     F y = cy + start;
4031     for (int j = 0; j < D; j++, y += 1.0f) {
4032         F x = cx + start;
4033         for (int i = 0; i < D; i++, x += 1.0f) {
4034             U16 R,G,B,A;
4035             sample(ctx, x,y, &R,&G,&B,&A);
4036 
4037             U16 w = (i == D-1 && j == D-1) ? remaining
4038                                            : cast<U16>(wx[i]*wy[j]*bias);
4039             remaining -= w;
4040             *r += w*R;
4041             *g += w*G;
4042             *b += w*B;
4043             *a += w*A;
4044         }
4045     }
4046     *r = (*r + bias/2) / bias;
4047     *g = (*g + bias/2) / bias;
4048     *b = (*b + bias/2) / bias;
4049     *a = (*a + bias/2) / bias;
4050 }
4051 
4052 STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
4053     F fx = fract(x + 0.5f),
4054       fy = fract(y + 0.5f);
4055     const F wx[] = {1.0f - fx, fx};
4056     const F wy[] = {1.0f - fy, fy};
4057 
4058     sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
4059 }
4060 #endif
4061 
4062 // ~~~~~~ GrSwizzle stage ~~~~~~ //
4063 
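// Added note: the four swizzle characters are packed into the context pointer's own bytes
// (hence the memcpy from &ctx below), so e.g. a "rgb1" swizzle copies r,g,b and forces
// alpha to fully opaque (255 in lowp).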
4064 STAGE_PP(swizzle, void* ctx) {
4065     auto ir = r, ig = g, ib = b, ia = a;
4066     U16* o[] = {&r, &g, &b, &a};
4067     char swiz[4];
4068     memcpy(swiz, &ctx, sizeof(swiz));
4069 
4070     for (int i = 0; i < 4; ++i) {
4071         switch (swiz[i]) {
4072             case 'r': *o[i] = ir;       break;
4073             case 'g': *o[i] = ig;       break;
4074             case 'b': *o[i] = ib;       break;
4075             case 'a': *o[i] = ia;       break;
4076             case '0': *o[i] = U16(0);   break;
4077             case '1': *o[i] = U16(255); break;
4078             default:                    break;
4079         }
4080     }
4081 }
4082 
4083 // Now we'll add null stand-ins for stages we haven't implemented in lowp.
4084 // If a pipeline uses any of these stages, it gets booted out of lowp into highp.
4085 #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
4086     NOT_IMPLEMENTED(callback)
4087     NOT_IMPLEMENTED(interpreter)
4088     NOT_IMPLEMENTED(unbounded_set_rgb)
4089     NOT_IMPLEMENTED(unbounded_uniform_color)
4090     NOT_IMPLEMENTED(unpremul)
4091     NOT_IMPLEMENTED(dither)  // TODO
4092     NOT_IMPLEMENTED(from_srgb)
4093     NOT_IMPLEMENTED(to_srgb)
4094     NOT_IMPLEMENTED(load_16161616)
4095     NOT_IMPLEMENTED(store_16161616)
4096     NOT_IMPLEMENTED(load_a16)
4097     NOT_IMPLEMENTED(store_a16)
4098     NOT_IMPLEMENTED(load_rg1616)
4099     NOT_IMPLEMENTED(store_rg1616)
4100     NOT_IMPLEMENTED(load_f16)
4101     NOT_IMPLEMENTED(load_f16_dst)
4102     NOT_IMPLEMENTED(store_f16)
4103     NOT_IMPLEMENTED(gather_f16)
4104     NOT_IMPLEMENTED(load_af16)
4105     NOT_IMPLEMENTED(store_af16)
4106     NOT_IMPLEMENTED(load_rgf16)
4107     NOT_IMPLEMENTED(store_rgf16)
4108     NOT_IMPLEMENTED(load_f32)
4109     NOT_IMPLEMENTED(load_f32_dst)
4110     NOT_IMPLEMENTED(store_f32)
4111     NOT_IMPLEMENTED(gather_f32)
4112     NOT_IMPLEMENTED(load_rgf32)
4113     NOT_IMPLEMENTED(store_rgf32)
4114     NOT_IMPLEMENTED(load_1010102)
4115     NOT_IMPLEMENTED(load_1010102_dst)
4116     NOT_IMPLEMENTED(store_1010102)
4117     NOT_IMPLEMENTED(gather_1010102)
4118     NOT_IMPLEMENTED(store_u16_be)
4119     NOT_IMPLEMENTED(byte_tables)  // TODO
4120     NOT_IMPLEMENTED(colorburn)
4121     NOT_IMPLEMENTED(colordodge)
4122     NOT_IMPLEMENTED(softlight)
4123     NOT_IMPLEMENTED(hue)
4124     NOT_IMPLEMENTED(saturation)
4125     NOT_IMPLEMENTED(color)
4126     NOT_IMPLEMENTED(luminosity)
4127     NOT_IMPLEMENTED(matrix_3x3)
4128     NOT_IMPLEMENTED(matrix_3x4)
4129     NOT_IMPLEMENTED(matrix_4x5)  // TODO
4130     NOT_IMPLEMENTED(matrix_4x3)  // TODO
4131     NOT_IMPLEMENTED(parametric)
4132     NOT_IMPLEMENTED(gamma_)
4133     NOT_IMPLEMENTED(rgb_to_hsl)
4134     NOT_IMPLEMENTED(hsl_to_rgb)
4135     NOT_IMPLEMENTED(gauss_a_to_rgba)  // TODO
4136     NOT_IMPLEMENTED(mirror_x)         // TODO
4137     NOT_IMPLEMENTED(repeat_x)         // TODO
4138     NOT_IMPLEMENTED(mirror_y)         // TODO
4139     NOT_IMPLEMENTED(repeat_y)         // TODO
4140     NOT_IMPLEMENTED(negate_x)
4141     NOT_IMPLEMENTED(bicubic)  // TODO if I can figure out negative weights
4142     NOT_IMPLEMENTED(bicubic_clamp_8888)
4143     NOT_IMPLEMENTED(bilinear_nx)      // TODO
4144     NOT_IMPLEMENTED(bilinear_ny)      // TODO
4145     NOT_IMPLEMENTED(bilinear_px)      // TODO
4146     NOT_IMPLEMENTED(bilinear_py)      // TODO
4147     NOT_IMPLEMENTED(bicubic_n3x)      // TODO
4148     NOT_IMPLEMENTED(bicubic_n1x)      // TODO
4149     NOT_IMPLEMENTED(bicubic_p1x)      // TODO
4150     NOT_IMPLEMENTED(bicubic_p3x)      // TODO
4151     NOT_IMPLEMENTED(bicubic_n3y)      // TODO
4152     NOT_IMPLEMENTED(bicubic_n1y)      // TODO
4153     NOT_IMPLEMENTED(bicubic_p1y)      // TODO
4154     NOT_IMPLEMENTED(bicubic_p3y)      // TODO
4155     NOT_IMPLEMENTED(save_xy)          // TODO
4156     NOT_IMPLEMENTED(accumulate)       // TODO
4157     NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved)
4158     NOT_IMPLEMENTED(xy_to_2pt_conical_strip)
4159     NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle)
4160     NOT_IMPLEMENTED(xy_to_2pt_conical_smaller)
4161     NOT_IMPLEMENTED(xy_to_2pt_conical_greater)
4162     NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal)
4163     NOT_IMPLEMENTED(alter_2pt_conical_unswap)
4164     NOT_IMPLEMENTED(mask_2pt_conical_nan)
4165     NOT_IMPLEMENTED(mask_2pt_conical_degenerates)
4166     NOT_IMPLEMENTED(apply_vector_mask)
4167 #undef NOT_IMPLEMENTED
4168 
4169 #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
4170 }  // namespace lowp
4171 
4172 }  // namespace SK_OPTS_NS
4173 
4174 #endif//SkRasterPipeline_opts_DEFINED
4175