/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
*/
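/*
(The command above uses fish-shell chaining; a bash equivalent, for reference:
 ninja -C out/Release dm nanobench && ./out/Release/dm --match Blend_opts && ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q)
*/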

#ifndef SkBlend_opts_DEFINED
#define SkBlend_opts_DEFINED

#include "SkNx.h"
#include "SkPM4fPriv.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#endif

namespace SK_OPTS_NS {

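// Blends one sRGB-encoded, premultiplied src pixel over one sRGB-encoded dst pixel.
// A fully opaque src replaces dst outright; otherwise both pixels are converted to
// linear floats, composited with srcover (s + d*(1 - sa)), and converted back.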
static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
    if (src >= 0xFF000000) {
        *dst = src;
        return;
    }
    auto d = Sk4f_fromS32(*dst),
         s = Sk4f_fromS32( src);
    *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
}

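// Blends four consecutive pixels by delegating to the single-pixel version.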
static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst  , *src  );
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

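    // Unaligned load/store of four packed 32-bit pixels.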
    static inline __m128i load(const uint32_t* p) {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }

    static inline void store(uint32_t* p, __m128i v) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
    }

    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

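    // Blends ndst dst pixels with the nsrc src pixels starting at srcStart, re-reading
    // the src span from its start whenever it is exhausted. Pixels are classified four
    // at a time: runs whose src is fully opaque are copied (_mm_testc_si128 on the alpha
    // bytes), runs whose src is fully transparent leave dst untouched (_mm_testz_si128),
    // and only mixed runs pay for the full blend. The final 1-3 pixels go one at a time.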
    static void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        while (ndst > 0) {
            int count = SkTMin(ndst, nsrc);
            ndst -= count;
            const uint32_t* src = srcStart;
            const uint32_t* end = dst + (count & ~3);
            ptrdiff_t delta = src - dst;

            while (dst < end) {
                __m128i pixels = load(src);
                if (_mm_testc_si128(pixels, alphaMask)) {
                    // All four src pixels are opaque: copy them straight to dst.
                    uint32_t* start = dst;
                    do {
                        store(dst, pixels);
                        dst += 4;
                    } while (dst < end
                             && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                } else if (_mm_testz_si128(pixels, alphaMask)) {
                    // All four src pixels are transparent: dst is left unchanged.
                    do {
                        dst += 4;
                        src += 4;
                    } while (dst < end
                             && _mm_testz_si128(pixels = load(src), alphaMask));
                } else {
                    // Mixed alphas: do the full srcover blend.
                    uint32_t* start = dst;
                    do {
                        srcover_srgb_srgb_4(dst, dst + delta);
                        dst += 4;
                    } while (dst < end
                             && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                }
            }

            count = count & 3;
            while (count-- > 0) {
                srcover_srgb_srgb_1(dst++, *src++);
            }
        }
    }
    #else
    // SSE2 versions

    // Note: In the following three checks, a group of 4 pixels is converted to a group
    // of "signed" pixels because SSE2 has no unsigned 32-bit comparison. XORing each
    // pixel with 0x80000000 flips its sign bit so that signed comparisons reproduce the
    // unsigned ordering: 0x00xxxxxx (the smallest values) becomes 0x80xxxxxx, and
    // 0xFFxxxxxx (the largest values) becomes 0x7Fxxxxxx.
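    //
    // Worked example: an opaque pixel 0xFF102030 becomes 0x7F102030, which is not less
    // than 0x7F000000, so it does not trip the "not opaque" compare below; a transparent
    // pixel 0x00102030 becomes 0x80102030, which is not greater than 0x80FFFFFF, so it
    // does not trip the "not transparent" compare.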
    // Returns true if all four pixels have alpha == 0xFF.
    static inline bool check_opaque_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        int mask =
            _mm_movemask_epi8(
                _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
        return mask == 0;
    }

    // Returns true if all four pixels have alpha == 0x00.
    static inline bool check_transparent_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        int mask =
            _mm_movemask_epi8(
                _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
        return mask == 0;
    }

    // Returns true if every pixel has a partial alpha (neither 0xFF nor 0x00). For such
    // a pixel both comparisons below fire, so their XOR is zero; for an opaque or a
    // transparent pixel exactly one fires, making the mask non-zero.
    static inline bool check_partial_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        __m128i opaque       = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
        __m128i transparent  = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
        int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
        return mask == 0;
    }

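    // Same strategy as the SSE4.1 version above: classify four src pixels at a time as
    // all-opaque, all-transparent, or mixed, and only blend the mixed runs; the PTEST
    // checks are replaced by the movemask-based helpers.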
    static void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
        while (ndst > 0) {
            int count = SkTMin(ndst, nsrc);
            ndst -= count;
            const uint32_t* src = srcStart;
            const uint32_t* end = dst + (count & ~3);
            const ptrdiff_t delta = src - dst;

            // Only enter the vector loop when there is at least one full group of four
            // pixels; the do-while below would otherwise read and write past the range.
            if (dst < end) {
                __m128i pixels = load(src);
                do {
                    if (check_opaque_alphas(pixels)) {
                        uint32_t* start = dst;
                        do {
                            store(dst, pixels);
                            dst += 4;
                        } while (dst < end && check_opaque_alphas(pixels = load(dst + delta)));
                        src += dst - start;
                    } else if (check_transparent_alphas(pixels)) {
                        const uint32_t* start = dst;
                        do {
                            dst += 4;
                        } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
                        src += dst - start;
                    } else {
                        const uint32_t* start = dst;
                        do {
                            srcover_srgb_srgb_4(dst, dst + delta);
                            dst += 4;
                        } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
                        src += dst - start;
                    }
                } while (dst < end);
            }

            count = count & 3;
            while (count-- > 0) {
                srcover_srgb_srgb_1(dst++, *src++);
            }
        }
    }
    #endif
#else

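    // Portable fallback: blend one pixel at a time, re-reading the nsrc-pixel src span
    // from its start until all ndst dst pixels have been covered.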
    static void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
        while (ndst > 0) {
            int n = SkTMin(ndst, nsrc);

            for (int i = 0; i < n; i++) {
                srcover_srgb_srgb_1(dst++, src[i]);
            }
            ndst -= n;
        }
    }

#endif
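
/*
 * Example usage (a hypothetical caller, not part of this header's API): blending a
 * repeating four-pixel sRGB pattern over a row of sixteen sRGB dst pixels:
 *
 *     uint32_t pattern[4];   // nsrc = 4 premultiplied, sRGB-encoded src pixels
 *     uint32_t row[16];      // ndst = 16 premultiplied, sRGB-encoded dst pixels
 *     SK_OPTS_NS::srcover_srgb_srgb(row, pattern, 16, 4);
 *
 * The src span is re-read from its start each time nsrc pixels have been consumed.
 */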

}  // namespace SK_OPTS_NS

#endif  // SkBlend_opts_DEFINED