1 /*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkNx_neon_DEFINED
9 #define SkNx_neon_DEFINED
10
11 #include <arm_neon.h>
12
13 namespace { // NOLINT(google-build-namespaces)
14
// ARMv8 has vrndm(q)_f32 to floor floats.  Here we emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
emulate_vrndmq_f32(float32x4_t v)19 AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {
20 auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
21 auto too_big = vcgtq_f32(roundtrip, v);
22 return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)));
23 }
emulate_vrndm_f32(float32x2_t v)24 AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {
25 auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
26 auto too_big = vcgt_f32(roundtrip, v);
27 return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1)));
28 }
29
30 template <>
31 class SkNx<2, float> {
32 public:
SkNx(float32x2_t vec)33 AI SkNx(float32x2_t vec) : fVec(vec) {}
34
SkNx()35 AI SkNx() {}
SkNx(float val)36 AI SkNx(float val) : fVec(vdup_n_f32(val)) {}
SkNx(float a,float b)37 AI SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }
38
Load(const void * ptr)39 AI static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
store(void * ptr)40 AI void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }
41
Load2(const void * ptr,SkNx * x,SkNx * y)42 AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
43 float32x2x2_t xy = vld2_f32((const float*) ptr);
44 *x = xy.val[0];
45 *y = xy.val[1];
46 }
47
Store2(void * dst,const SkNx & a,const SkNx & b)48 AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
49 float32x2x2_t ab = {{
50 a.fVec,
51 b.fVec,
52 }};
53 vst2_f32((float*) dst, ab);
54 }
55
Store3(void * dst,const SkNx & a,const SkNx & b,const SkNx & c)56 AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
57 float32x2x3_t abc = {{
58 a.fVec,
59 b.fVec,
60 c.fVec,
61 }};
62 vst3_f32((float*) dst, abc);
63 }
64
Store4(void * dst,const SkNx & a,const SkNx & b,const SkNx & c,const SkNx & d)65 AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
66 float32x2x4_t abcd = {{
67 a.fVec,
68 b.fVec,
69 c.fVec,
70 d.fVec,
71 }};
72 vst4_f32((float*) dst, abcd);
73 }
74
75 AI SkNx operator - () const { return vneg_f32(fVec); }
76
77 AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
78 AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
79 AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
80 AI SkNx operator / (const SkNx& o) const {
81 #if defined(SK_CPU_ARM64)
82 return vdiv_f32(fVec, o.fVec);
83 #else
84 float32x2_t est0 = vrecpe_f32(o.fVec),
85 est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
86 est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
87 return vmul_f32(fVec, est2);
88 #endif
89 }
90
91 AI SkNx operator==(const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
92 AI SkNx operator <(const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
93 AI SkNx operator >(const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
94 AI SkNx operator<=(const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
95 AI SkNx operator>=(const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
96 AI SkNx operator!=(const SkNx& o) const {
97 return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
98 }
99
Min(const SkNx & l,const SkNx & r)100 AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
Max(const SkNx & l,const SkNx & r)101 AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }
102
abs()103 AI SkNx abs() const { return vabs_f32(fVec); }
floor()104 AI SkNx floor() const {
105 #if defined(SK_CPU_ARM64)
106 return vrndm_f32(fVec);
107 #else
108 return emulate_vrndm_f32(fVec);
109 #endif
110 }
111
sqrt()112 AI SkNx sqrt() const {
113 #if defined(SK_CPU_ARM64)
114 return vsqrt_f32(fVec);
115 #else
116 float32x2_t est0 = vrsqrte_f32(fVec),
117 est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0),
118 est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
119 return vmul_f32(fVec, est2);
120 #endif
121 }
122
123 AI float operator[](int k) const {
124 SkASSERT(0 <= k && k < 2);
125 union { float32x2_t v; float fs[2]; } pun = {fVec};
126 return pun.fs[k&1];
127 }
128
allTrue()129 AI bool allTrue() const {
130 #if defined(SK_CPU_ARM64)
131 return 0 != vminv_u32(vreinterpret_u32_f32(fVec));
132 #else
133 auto v = vreinterpret_u32_f32(fVec);
134 return vget_lane_u32(v,0) && vget_lane_u32(v,1);
135 #endif
136 }
anyTrue()137 AI bool anyTrue() const {
138 #if defined(SK_CPU_ARM64)
139 return 0 != vmaxv_u32(vreinterpret_u32_f32(fVec));
140 #else
141 auto v = vreinterpret_u32_f32(fVec);
142 return vget_lane_u32(v,0) || vget_lane_u32(v,1);
143 #endif
144 }
145
thenElse(const SkNx & t,const SkNx & e)146 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
147 return vbsl_f32(vreinterpret_u32_f32(fVec), t.fVec, e.fVec);
148 }
149
150 float32x2_t fVec;
151 };
152
153 template <>
154 class SkNx<4, float> {
155 public:
SkNx(float32x4_t vec)156 AI SkNx(float32x4_t vec) : fVec(vec) {}
157
SkNx()158 AI SkNx() {}
SkNx(float val)159 AI SkNx(float val) : fVec(vdupq_n_f32(val)) {}
SkNx(float a,float b,float c,float d)160 AI SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
161
Load(const void * ptr)162 AI static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
store(void * ptr)163 AI void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
164
Load2(const void * ptr,SkNx * x,SkNx * y)165 AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
166 float32x4x2_t xy = vld2q_f32((const float*) ptr);
167 *x = xy.val[0];
168 *y = xy.val[1];
169 }
170
Load4(const void * ptr,SkNx * r,SkNx * g,SkNx * b,SkNx * a)171 AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
172 float32x4x4_t rgba = vld4q_f32((const float*) ptr);
173 *r = rgba.val[0];
174 *g = rgba.val[1];
175 *b = rgba.val[2];
176 *a = rgba.val[3];
177 }
Store4(void * dst,const SkNx & r,const SkNx & g,const SkNx & b,const SkNx & a)178 AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
179 float32x4x4_t rgba = {{
180 r.fVec,
181 g.fVec,
182 b.fVec,
183 a.fVec,
184 }};
185 vst4q_f32((float*) dst, rgba);
186 }
187
188 AI SkNx operator - () const { return vnegq_f32(fVec); }
189
190 AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
191 AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
192 AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
193 AI SkNx operator / (const SkNx& o) const {
194 #if defined(SK_CPU_ARM64)
195 return vdivq_f32(fVec, o.fVec);
196 #else
197 float32x4_t est0 = vrecpeq_f32(o.fVec),
198 est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
199 est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
200 return vmulq_f32(fVec, est2);
201 #endif
202 }
203
204 AI SkNx operator==(const SkNx& o) const {return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec));}
205 AI SkNx operator <(const SkNx& o) const {return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec));}
206 AI SkNx operator >(const SkNx& o) const {return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec));}
207 AI SkNx operator<=(const SkNx& o) const {return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec));}
208 AI SkNx operator>=(const SkNx& o) const {return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec));}
209 AI SkNx operator!=(const SkNx& o) const {
210 return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
211 }
212
Min(const SkNx & l,const SkNx & r)213 AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
Max(const SkNx & l,const SkNx & r)214 AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }
215
abs()216 AI SkNx abs() const { return vabsq_f32(fVec); }
floor()217 AI SkNx floor() const {
218 #if defined(SK_CPU_ARM64)
219 return vrndmq_f32(fVec);
220 #else
221 return emulate_vrndmq_f32(fVec);
222 #endif
223 }
224
225
sqrt()226 AI SkNx sqrt() const {
227 #if defined(SK_CPU_ARM64)
228 return vsqrtq_f32(fVec);
229 #else
230 float32x4_t est0 = vrsqrteq_f32(fVec),
231 est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0),
232 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
233 return vmulq_f32(fVec, est2);
234 #endif
235 }
236
237 AI float operator[](int k) const {
238 SkASSERT(0 <= k && k < 4);
239 union { float32x4_t v; float fs[4]; } pun = {fVec};
240 return pun.fs[k&3];
241 }
242
min()243 AI float min() const {
244 #if defined(SK_CPU_ARM64)
245 return vminvq_f32(fVec);
246 #else
247 SkNx min = Min(*this, vrev64q_f32(fVec));
248 return std::min(min[0], min[2]);
249 #endif
250 }
251
max()252 AI float max() const {
253 #if defined(SK_CPU_ARM64)
254 return vmaxvq_f32(fVec);
255 #else
256 SkNx max = Max(*this, vrev64q_f32(fVec));
257 return std::max(max[0], max[2]);
258 #endif
259 }
260
allTrue()261 AI bool allTrue() const {
262 #if defined(SK_CPU_ARM64)
263 return 0 != vminvq_u32(vreinterpretq_u32_f32(fVec));
264 #else
265 auto v = vreinterpretq_u32_f32(fVec);
266 return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
267 && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
268 #endif
269 }
anyTrue()270 AI bool anyTrue() const {
271 #if defined(SK_CPU_ARM64)
272 return 0 != vmaxvq_u32(vreinterpretq_u32_f32(fVec));
273 #else
274 auto v = vreinterpretq_u32_f32(fVec);
275 return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
276 || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
277 #endif
278 }
279
thenElse(const SkNx & t,const SkNx & e)280 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
281 return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
282 }
283
284 float32x4_t fVec;
285 };
286
#if defined(SK_CPU_ARM64)
// Four-lane fused multiply-add, ARM64 only: vfmaq_f32 accumulates f*m onto a.
AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
    const float32x4_t accumulated = vfmaq_f32(a.fVec, f.fVec, m.fVec);
    return accumulated;
}
#endif
292
293 // It's possible that for our current use cases, representing this as
294 // half a uint16x8_t might be better than representing it as a uint16x4_t.
295 // It'd make conversion to Sk4b one step simpler.
296 template <>
297 class SkNx<4, uint16_t> {
298 public:
SkNx(const uint16x4_t & vec)299 AI SkNx(const uint16x4_t& vec) : fVec(vec) {}
300
SkNx()301 AI SkNx() {}
SkNx(uint16_t val)302 AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
SkNx(uint16_t a,uint16_t b,uint16_t c,uint16_t d)303 AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
304 fVec = (uint16x4_t) { a,b,c,d };
305 }
306
Load(const void * ptr)307 AI static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
store(void * ptr)308 AI void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }
309
Load4(const void * ptr,SkNx * r,SkNx * g,SkNx * b,SkNx * a)310 AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
311 uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
312 *r = rgba.val[0];
313 *g = rgba.val[1];
314 *b = rgba.val[2];
315 *a = rgba.val[3];
316 }
Load3(const void * ptr,SkNx * r,SkNx * g,SkNx * b)317 AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
318 uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr);
319 *r = rgba.val[0];
320 *g = rgba.val[1];
321 *b = rgba.val[2];
322 }
Store4(void * dst,const SkNx & r,const SkNx & g,const SkNx & b,const SkNx & a)323 AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
324 uint16x4x4_t rgba = {{
325 r.fVec,
326 g.fVec,
327 b.fVec,
328 a.fVec,
329 }};
330 vst4_u16((uint16_t*) dst, rgba);
331 }
332
333 AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
334 AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
335 AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
336 AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); }
337 AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); }
338
339 AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
340 AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
341
Min(const SkNx & a,const SkNx & b)342 AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }
343
344 AI uint16_t operator[](int k) const {
345 SkASSERT(0 <= k && k < 4);
346 union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
347 return pun.us[k&3];
348 }
349
thenElse(const SkNx & t,const SkNx & e)350 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
351 return vbsl_u16(fVec, t.fVec, e.fVec);
352 }
353
354 uint16x4_t fVec;
355 };
356
357 template <>
358 class SkNx<8, uint16_t> {
359 public:
SkNx(const uint16x8_t & vec)360 AI SkNx(const uint16x8_t& vec) : fVec(vec) {}
361
SkNx()362 AI SkNx() {}
SkNx(uint16_t val)363 AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
Load(const void * ptr)364 AI static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); }
365
SkNx(uint16_t a,uint16_t b,uint16_t c,uint16_t d,uint16_t e,uint16_t f,uint16_t g,uint16_t h)366 AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
367 uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
368 fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
369 }
370
store(void * ptr)371 AI void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }
372
373 AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
374 AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
375 AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
376 AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); }
377 AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); }
378
379 AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
380 AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
381
Min(const SkNx & a,const SkNx & b)382 AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }
383
384 AI uint16_t operator[](int k) const {
385 SkASSERT(0 <= k && k < 8);
386 union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
387 return pun.us[k&7];
388 }
389
mulHi(const SkNx & m)390 AI SkNx mulHi(const SkNx& m) const {
391 uint32x4_t hi = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec));
392 uint32x4_t lo = vmull_u16( vget_low_u16(fVec), vget_low_u16(m.fVec));
393
394 return { vcombine_u16(vshrn_n_u32(lo,16), vshrn_n_u32(hi,16)) };
395 }
396
thenElse(const SkNx & t,const SkNx & e)397 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
398 return vbslq_u16(fVec, t.fVec, e.fVec);
399 }
400
401 uint16x8_t fVec;
402 };
403
404 template <>
405 class SkNx<4, uint8_t> {
406 public:
407 typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;
408
SkNx(const uint8x8_t & vec)409 AI SkNx(const uint8x8_t& vec) : fVec(vec) {}
410
SkNx()411 AI SkNx() {}
SkNx(uint8_t a,uint8_t b,uint8_t c,uint8_t d)412 AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
413 fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
414 }
Load(const void * ptr)415 AI static SkNx Load(const void* ptr) {
416 return (uint8x8_t)vld1_dup_u32((const unaligned_uint32_t*)ptr);
417 }
store(void * ptr)418 AI void store(void* ptr) const {
419 return vst1_lane_u32((unaligned_uint32_t*)ptr, (uint32x2_t)fVec, 0);
420 }
421 AI uint8_t operator[](int k) const {
422 SkASSERT(0 <= k && k < 4);
423 union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
424 return pun.us[k&3];
425 }
426
427 // TODO as needed
428
429 uint8x8_t fVec;
430 };
431
432 template <>
433 class SkNx<8, uint8_t> {
434 public:
SkNx(const uint8x8_t & vec)435 AI SkNx(const uint8x8_t& vec) : fVec(vec) {}
436
SkNx()437 AI SkNx() {}
SkNx(uint8_t val)438 AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {}
SkNx(uint8_t a,uint8_t b,uint8_t c,uint8_t d,uint8_t e,uint8_t f,uint8_t g,uint8_t h)439 AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
440 uint8_t e, uint8_t f, uint8_t g, uint8_t h) {
441 fVec = (uint8x8_t) { a,b,c,d, e,f,g,h };
442 }
443
Load(const void * ptr)444 AI static SkNx Load(const void* ptr) { return vld1_u8((const uint8_t*)ptr); }
store(void * ptr)445 AI void store(void* ptr) const { vst1_u8((uint8_t*)ptr, fVec); }
446
447 AI uint8_t operator[](int k) const {
448 SkASSERT(0 <= k && k < 8);
449 union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
450 return pun.us[k&7];
451 }
452
453 uint8x8_t fVec;
454 };
455
456 template <>
457 class SkNx<16, uint8_t> {
458 public:
SkNx(const uint8x16_t & vec)459 AI SkNx(const uint8x16_t& vec) : fVec(vec) {}
460
SkNx()461 AI SkNx() {}
SkNx(uint8_t val)462 AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
SkNx(uint8_t a,uint8_t b,uint8_t c,uint8_t d,uint8_t e,uint8_t f,uint8_t g,uint8_t h,uint8_t i,uint8_t j,uint8_t k,uint8_t l,uint8_t m,uint8_t n,uint8_t o,uint8_t p)463 AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
464 uint8_t e, uint8_t f, uint8_t g, uint8_t h,
465 uint8_t i, uint8_t j, uint8_t k, uint8_t l,
466 uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
467 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
468 }
469
Load(const void * ptr)470 AI static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }
store(void * ptr)471 AI void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }
472
saturatedAdd(const SkNx & o)473 AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }
474
475 AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
476 AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
477 AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); }
478
Min(const SkNx & a,const SkNx & b)479 AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
480 AI SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
481
482 AI uint8_t operator[](int k) const {
483 SkASSERT(0 <= k && k < 16);
484 union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
485 return pun.us[k&15];
486 }
487
thenElse(const SkNx & t,const SkNx & e)488 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
489 return vbslq_u8(fVec, t.fVec, e.fVec);
490 }
491
492 uint8x16_t fVec;
493 };
494
495 template <>
496 class SkNx<4, int32_t> {
497 public:
SkNx(const int32x4_t & vec)498 AI SkNx(const int32x4_t& vec) : fVec(vec) {}
499
SkNx()500 AI SkNx() {}
SkNx(int32_t v)501 AI SkNx(int32_t v) {
502 fVec = vdupq_n_s32(v);
503 }
SkNx(int32_t a,int32_t b,int32_t c,int32_t d)504 AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
505 fVec = (int32x4_t){a,b,c,d};
506 }
Load(const void * ptr)507 AI static SkNx Load(const void* ptr) {
508 return vld1q_s32((const int32_t*)ptr);
509 }
store(void * ptr)510 AI void store(void* ptr) const {
511 return vst1q_s32((int32_t*)ptr, fVec);
512 }
513 AI int32_t operator[](int k) const {
514 SkASSERT(0 <= k && k < 4);
515 union { int32x4_t v; int32_t is[4]; } pun = {fVec};
516 return pun.is[k&3];
517 }
518
519 AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
520 AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
521 AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
522
523 AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
524 AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
525 AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }
526
527 AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
528 AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
529
530 AI SkNx operator == (const SkNx& o) const {
531 return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec));
532 }
533 AI SkNx operator < (const SkNx& o) const {
534 return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec));
535 }
536 AI SkNx operator > (const SkNx& o) const {
537 return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec));
538 }
539
Min(const SkNx & a,const SkNx & b)540 AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
Max(const SkNx & a,const SkNx & b)541 AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); }
542 // TODO as needed
543
thenElse(const SkNx & t,const SkNx & e)544 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
545 return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec);
546 }
547
abs()548 AI SkNx abs() const { return vabsq_s32(fVec); }
549
550 int32x4_t fVec;
551 };
552
553 template <>
554 class SkNx<4, uint32_t> {
555 public:
SkNx(const uint32x4_t & vec)556 AI SkNx(const uint32x4_t& vec) : fVec(vec) {}
557
SkNx()558 AI SkNx() {}
SkNx(uint32_t v)559 AI SkNx(uint32_t v) {
560 fVec = vdupq_n_u32(v);
561 }
SkNx(uint32_t a,uint32_t b,uint32_t c,uint32_t d)562 AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
563 fVec = (uint32x4_t){a,b,c,d};
564 }
Load(const void * ptr)565 AI static SkNx Load(const void* ptr) {
566 return vld1q_u32((const uint32_t*)ptr);
567 }
store(void * ptr)568 AI void store(void* ptr) const {
569 return vst1q_u32((uint32_t*)ptr, fVec);
570 }
571 AI uint32_t operator[](int k) const {
572 SkASSERT(0 <= k && k < 4);
573 union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
574 return pun.us[k&3];
575 }
576
577 AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
578 AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
579 AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }
580
581 AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
582 AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
583 AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }
584
585 AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
586 AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
587
588 AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
589 AI SkNx operator < (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
590 AI SkNx operator > (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }
591
Min(const SkNx & a,const SkNx & b)592 AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
593 // TODO as needed
594
mulHi(const SkNx & m)595 AI SkNx mulHi(const SkNx& m) const {
596 uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
597 uint64x2_t lo = vmull_u32( vget_low_u32(fVec), vget_low_u32(m.fVec));
598
599 return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) };
600 }
601
thenElse(const SkNx & t,const SkNx & e)602 AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
603 return vbslq_u32(fVec, t.fVec, e.fVec);
604 }
605
606 uint32x4_t fVec;
607 };
608
609 template<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
610 return vcvtq_s32_f32(src.fVec);
611
612 }
613 template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
614 return vcvtq_f32_s32(src.fVec);
615 }
616 template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
617 return SkNx_cast<float>(Sk4i::Load(&src));
618 }
619
620 template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
621 return vqmovn_u32(vcvtq_u32_f32(src.fVec));
622 }
623
624 template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
625 return vcvtq_f32_u32(vmovl_u16(src.fVec));
626 }
627
628 template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
629 uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
630 uint16x4_t _16 = vqmovn_u32(_32);
631 return vqmovn_u16(vcombine_u16(_16, _16));
632 }
633
634 template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
635 uint16x8_t _16 = vmovl_u8(src.fVec);
636 return vmovl_u16(vget_low_u16(_16));
637 }
638
639 template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
640 return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec);
641 }
642
643 template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
644 return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec);
645 }
646
647 template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
648 Sk8f ab, cd;
649 SkNx_split(src, &ab, &cd);
650
651 Sk4f a,b,c,d;
652 SkNx_split(ab, &a, &b);
653 SkNx_split(cd, &c, &d);
654 return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
655 (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
656 vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
657 (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0];
658 }
659
660 template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
661 Sk4i a, b;
662 SkNx_split(src, &a, &b);
663 uint16x4_t a16 = vqmovun_s32(a.fVec);
664 uint16x4_t b16 = vqmovun_s32(b.fVec);
665
666 return vqmovn_u16(vcombine_u16(a16, b16));
667 }
668
669 template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
670 return vget_low_u16(vmovl_u8(src.fVec));
671 }
672
673 template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
674 return vmovl_u8(src.fVec);
675 }
676
677 template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
678 return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
679 }
680
681 template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
682 return vqmovn_u16(src.fVec);
683 }
684
685 template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
686 uint16x4_t _16 = vqmovun_s32(src.fVec);
687 return vqmovn_u16(vcombine_u16(_16, _16));
688 }
689
690 template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
691 uint16x4_t _16 = vqmovn_u32(src.fVec);
692 return vqmovn_u16(vcombine_u16(_16, _16));
693 }
694
695 template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
696 return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
697 }
698
699 template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
700 return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
701 }
702
703 template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
704 return vreinterpretq_s32_u32(src.fVec);
705 }
706
Sk4f_round(const Sk4f & x)707 AI static Sk4i Sk4f_round(const Sk4f& x) {
708 return vcvtq_s32_f32((x + 0.5f).fVec);
709 }
710
711 } // namespace
712
713 #endif//SkNx_neon_DEFINED
714