/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.

#define SKNX_IS_FAST

// SSE 4.1 has _mm_floor_ps to floor 4 floats.  We emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
static inline __m128 sse2_mm_floor_ps(__m128 v) {
    __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    __m128 too_big = _mm_cmpgt_ps(roundtrip, v);
    return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
}
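// Worked example: for v = -1.5f, truncation gives -1, the roundtrip -1.0f compares
// greater than -1.5f, so we subtract 1 to land on the true floor, -2.0f.  For
// v = +1.5f the roundtrip 1.0f is not too big and passes through unchanged.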

template <>
class SkNx<2, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }
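    // Note that Load() and store() move only the low eight bytes (two floats):
    // the upper two lanes are zeroed on load and ignored on store.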

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec);  }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }
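    // rsqrt0/1/2 and approxInvert() trade accuracy for speed: _mm_rsqrt_ps and
    // _mm_rcp_ps return estimates good to roughly 12 bits, while sqrt() and
    // invert() are fully accurate.  The numbered variants give other backends
    // room to refine the estimate; this SSE version returns the same estimate
    // for all three.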

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }

    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
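    // The comparison operators fill each lane with all 1s (true) or all 0s
    // (false), so _mm_movemask_epi8 gathers one bit per byte.  Masking with 0xff
    // keeps only the low eight bytes -- the two lanes this Sk2f actually uses.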

    __m128 fVec;
};

template <>
class SkNx<4, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val)           : fVec( _mm_set1_ps(val) ) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
    SkNx floor() const { return sse2_mm_floor_ps(fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec);  }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

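    // thenElse() is a branchless bit-select: each lane of *this should be a mask
    // from a comparison, choosing the lane of t where true and of e where false.
    // Illustrative use:  (x < Sk4f(0)).thenElse(Sk4f(0) - x, x)  is a branch-free abs.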
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    }

    __m128 fVec;
};

template <>
class SkNx<4, int> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
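    // SSE2 has no 32-bit low multiply (_mm_mullo_epi32 arrived with SSE4.1), so
    // multiply the even and odd lanes separately with _mm_mul_epu32 and shuffle
    // the low 32 bits of each 64-bit product back into place.  (The low 32 bits
    // of a product are the same whether the inputs are signed or unsigned.)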
    SkNx operator * (const SkNx& o) const {
        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }

    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
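    // Note operator>> is an arithmetic shift (_mm_srai_epi32) that preserves the
    // sign bit, as befits signed int; the unsigned uint16_t specializations below
    // use the logical _mm_srli_epi16 instead.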

    int operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; int is[4]; } pun = {fVec};
        return pun.is[k&3];
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
        const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
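        // Byte-wise add/sub is safe here even though the lanes are 16-bit: the
        // low byte of 0x8000 is zero, so no carry or borrow ever crosses a byte
        // boundary and the epi8 ops behave exactly like their epi16 counterparts.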
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }
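    // Sanity check of the flip: comparing 1 < 255 unsigned, the XOR maps the
    // operands to -127 and 127 as signed bytes, and the signed compare agrees.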

    uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
    return _mm_cvtepi32_ps(src.fVec);
}

template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
    return _mm_cvttps_epi32(src.fVec);
}

template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
    // Ideally we'd use _mm_packus_epi32 here.  But that's SSE4.1+.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
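    // The bias trick: subtracting 0x8000 slides unsigned [0, 65535] into signed
    // [-32768, 32767], where _mm_packs_epi32 saturates correctly; adding 0x8000
    // back as a 16-bit add then undoes the bias.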
    _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
    return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
#endif
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return     _mm_packus_epi16(_16, _16);
#endif
}

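// Widening from uint8_t zero-extends.  With SSSE3 one byte shuffle scatters each
// source byte into its own 32-bit lane; with SSE2 two unpacks against zero get
// there in two hops (8 -> 16 -> 32 bits).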
template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16,     _mm_setzero_si128());
#endif
    return _mm_cvtepi32_ps(_32);
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

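// Sk4f_ToBytes() narrows four Sk4f into 16 bytes: two rounds of _mm_packus_epi16
// clamp each truncated value and pack the results in order.  (This relies on the
// inputs being small enough -- e.g. color values in [0, 255] -- that treating
// each 32-bit lane as a pair of 16-bit lanes is harmless.)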
static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    _mm_storeu_si128((__m128i*)bytes,
                     _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                       _mm_cvttps_epi32(b.fVec)),
                                      _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                       _mm_cvttps_epi32(d.fVec))));
}

template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

#endif//SkNx_sse_DEFINED