• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 /*
9 ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench  --samples 300 --nompd --match LinearSrcOver -q
10  */
11 
12 #ifndef SkBlend_opts_DEFINED
13 #define SkBlend_opts_DEFINED
14 
15 #include "SkNx.h"
16 #include "SkPM4fPriv.h"
17 
18 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
19     #include <immintrin.h>
20 #endif
21 
22 namespace SK_OPTS_NS {
23 
srcover_srgb_srgb_1(uint32_t * dst,uint32_t src)24 static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
25     if (src >= 0xFF000000) {
26         *dst = src;
27         return;
28     }
29     auto d = Sk4f_fromS32(*dst),
30          s = Sk4f_fromS32( src);
31     *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
32 }
33 
srcover_srgb_srgb_4(uint32_t * dst,const uint32_t * src)34 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
35     srcover_srgb_srgb_1(dst++, *src++);
36     srcover_srgb_srgb_1(dst++, *src++);
37     srcover_srgb_srgb_1(dst++, *src++);
38     srcover_srgb_srgb_1(dst  , *src  );
39 }
40 
41 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
42 
// Reads four packed 32-bit pixels starting at p with an unaligned 128-bit load.
static inline __m128i load(const uint32_t* p) {
    const __m128i* vecPtr = reinterpret_cast<const __m128i*>(p);
    return _mm_loadu_si128(vecPtr);
}
46 
// Writes the four packed 32-bit pixels in v to p with an unaligned 128-bit store.
static inline void store(uint32_t* p, __m128i v) {
    __m128i* vecPtr = reinterpret_cast<__m128i*>(p);
    _mm_storeu_si128(vecPtr, v);
}
50 
51     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
52 
srcover_srgb_srgb(uint32_t * dst,const uint32_t * const srcStart,int ndst,const int nsrc)53         static void srcover_srgb_srgb(
54             uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
55             const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
56             while (ndst > 0) {
57                 int count = SkTMin(ndst, nsrc);
58                 ndst -= count;
59                 const uint32_t* src = srcStart;
60                 const uint32_t* end = dst + (count & ~3);
61                 ptrdiff_t delta = src - dst;
62 
63                 while (dst < end) {
64                     __m128i pixels = load(src);
65                     if (_mm_testc_si128(pixels, alphaMask)) {
66                          uint32_t* start = dst;
67                         do {
68                             store(dst, pixels);
69                             dst += 4;
70                         } while (dst < end
71                                  && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
72                         src += dst - start;
73                     } else if (_mm_testz_si128(pixels, alphaMask)) {
74                         do {
75                             dst += 4;
76                             src += 4;
77                         } while (dst < end
78                                  && _mm_testz_si128(pixels = load(src), alphaMask));
79                     } else {
80                         uint32_t* start = dst;
81                         do {
82                             srcover_srgb_srgb_4(dst, dst + delta);
83                             dst += 4;
84                         } while (dst < end
85                                  && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
86                         src += dst - start;
87                     }
88                 }
89 
90                 count = count & 3;
91                 while (count-- > 0) {
92                     srcover_srgb_srgb_1(dst++, *src++);
93                 }
94             }
95         }
96     #else
97     // SSE2 versions
98 
// Note: SSE2 has no unsigned 32-bit comparison, so each of the next three checks first
// converts the group of 4 pixels to "signed" pixels by flipping the top bit. That bias
// maps 0x00xxxxxx to 0x80xxxxxx, the smallest signed values, and 0xffxxxxxx to
// 0x7fxxxxxx, the largest signed values, so the signed comparison operators order the
// pixels the same way an unsigned comparison would.
// Returns true when all four packed pixels have an alpha byte of 0xFF.
static inline bool check_opaque_alphas(__m128i pixels) {
    const __m128i signBit   = _mm_set1_epi32(0x80000000);
    const __m128i minOpaque = _mm_set1_epi32(0x7F000000);  // 0xFF000000 with the sign bit flipped
    // Flip the sign bit so the signed compare orders lanes like an unsigned compare.
    const __m128i biased    = _mm_xor_si128(pixels, signBit);
    // A lane becomes all ones when its pixel is below 0xFF000000, i.e. not opaque.
    const __m128i notOpaque = _mm_cmplt_epi32(biased, minOpaque);
    return _mm_movemask_epi8(notOpaque) == 0;
}
111 
// Returns true when all four packed pixels have an alpha byte of 0x00.
static inline bool check_transparent_alphas(__m128i pixels) {
    const __m128i signBit        = _mm_set1_epi32(0x80000000);
    const __m128i maxTransparent = _mm_set1_epi32(0x80FFFFFF);  // 0x00FFFFFF with the sign bit flipped
    // Flip the sign bit so the signed compare orders lanes like an unsigned compare.
    const __m128i biased         = _mm_xor_si128(pixels, signBit);
    // A lane becomes all ones when its pixel is above 0x00FFFFFF, i.e. alpha is non-zero.
    const __m128i notTransparent = _mm_cmpgt_epi32(biased, maxTransparent);
    return _mm_movemask_epi8(notTransparent) == 0;
}
119 
// Returns true when every one of the four packed pixels has an alpha byte strictly
// between 0x00 and 0xFF. A single fully opaque or fully transparent pixel makes it false.
static inline bool check_partial_alphas(__m128i pixels) {
    const __m128i signBit        = _mm_set1_epi32(0x80000000);
    const __m128i minOpaque      = _mm_set1_epi32(0x7F000000);  // 0xFF000000 with the sign bit flipped
    const __m128i maxTransparent = _mm_set1_epi32(0x80FFFFFF);  // 0x00FFFFFF with the sign bit flipped
    // Flip the sign bit so the signed compares order lanes like unsigned compares.
    const __m128i biased         = _mm_xor_si128(pixels, signBit);
    const __m128i notOpaque      = _mm_cmplt_epi32(biased, minOpaque);
    const __m128i notTransparent = _mm_cmpgt_epi32(biased, maxTransparent);
    // A partially transparent pixel sets both lanes, so the XOR is zero there; an opaque
    // or transparent pixel sets exactly one of them and leaves a non-zero lane.
    const __m128i opaqueOrTransparent = _mm_xor_si128(notOpaque, notTransparent);
    return _mm_movemask_epi8(opaqueOrTransparent) == 0;
}
127 
srcover_srgb_srgb(uint32_t * dst,const uint32_t * const srcStart,int ndst,const int nsrc)128         static void srcover_srgb_srgb(
129             uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
130             while (ndst > 0) {
131                 int count = SkTMin(ndst, nsrc);
132                 ndst -= count;
133                 const uint32_t* src = srcStart;
134                 const uint32_t* end = dst + (count & ~3);
135                 const ptrdiff_t delta = src - dst;
136 
137                 __m128i pixels = load(src);
138                 do {
139                     if (check_opaque_alphas(pixels)) {
140                         uint32_t* start = dst;
141                         do {
142                             store(dst, pixels);
143                             dst += 4;
144                         } while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
145                         src += dst - start;
146                     } else if (check_transparent_alphas(pixels)) {
147                         const uint32_t* start = dst;
148                         do {
149                             dst += 4;
150                         } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
151                         src += dst - start;
152                     } else {
153                         const uint32_t* start = dst;
154                         do {
155                             srcover_srgb_srgb_4(dst, dst + delta);
156                             dst += 4;
157                         } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
158                         src += dst - start;
159                     }
160                 } while (dst < end);
161 
162                 count = count & 3;
163                 while (count-- > 0) {
164                     srcover_srgb_srgb_1(dst++, *src++);
165                 }
166             }
167         }
168     #endif
169 #else
170 
srcover_srgb_srgb(uint32_t * dst,const uint32_t * const src,int ndst,const int nsrc)171     static void srcover_srgb_srgb(
172         uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
173         while (ndst > 0) {
174             int n = SkTMin(ndst, nsrc);
175 
176             for (int i = 0; i < n; i++) {
177                 srcover_srgb_srgb_1(dst++, src[i]);
178             }
179             ndst -= n;
180         }
181     }
182 
183 #endif
184 
185 }  // namespace SK_OPTS_NS
186 
187 #endif//SkBlend_opts_DEFINED
188