1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkBitmapProcState_opts_DEFINED
9 #define SkBitmapProcState_opts_DEFINED
10
11 #include "src/core/SkBitmapProcState.h"
12
13 // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
14 //
15 // Only S32_alpha_D32_filter_DX exploits instructions beyond
16 // our common baseline SSE2/NEON instruction sets, so that's
17 // all that lives here.
18 //
19 // The rest are scattershot at the moment but I want to get them
20 // all migrated to be normal code inside SkBitmapProcState.cpp.
21
22 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
23 #include <immintrin.h>
24 #elif defined(SK_ARM_HAS_NEON)
25 #include <arm_neon.h>
26 #endif
27
28 namespace SK_OPTS_NS {
29
// Every packed 32-bit entry handled in this file uses one layout, from the high bits down:
//
//     [ 14 bits: v0 | 4 bits: w | 14 bits: v1 ]
//
// v0 and v1 are the integer coordinates (x0/x1 or y0/y1), and w is the interpolating
// factor between them, i.e. the weight applied to v1.
static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    const uint32_t coord_mask  = 0x3fff,   // low 14 bits
                   weight_mask = 0xf;      // low  4 bits

    const uint32_t hi_coord = packed >> 18;
    const uint32_t lo_coord = packed & coord_mask;
    const uint32_t weight   = (packed >> 14) & weight_mask;

    *v0 = hi_coord;
    *v1 = lo_coord;
    *w  = weight;
}
41
42 #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
43
// Four-lane counterpart of the scalar decoder: splits four packed entries at once.
// v0[] and v1[] receive the four high and four low 14-bit coordinates (unaligned stores),
// and *w receives the four 4-bit weights, one per 32-bit lane.
static void decode_packed_coordinates_and_weight(__m128i packed,
                                                 int v0[4], int v1[4], __m128i* w) {
    const __m128i coord_mask  = _mm_set1_epi32(0x3fff),
                  weight_mask = _mm_set1_epi32(0xf);

    const __m128i hi_coords = _mm_srli_epi32(packed, 18),
                  lo_coords = _mm_and_si128(packed, coord_mask),
                  weights   = _mm_and_si128(_mm_srli_epi32(packed, 14), weight_mask);

    _mm_storeu_si128((__m128i*)v0, hi_coords);
    _mm_storeu_si128((__m128i*)v1, lo_coords);
    *w = weights;
}
51
// Heart of the SSSE3 path: lerp in X for up to two output pixels (A and B) with a single
// _mm_maddubs_epi16().
//
// _mm_maddubs_epi16() multiplies bytes pairwise and adds each adjacent pair of products.
// Given two arguments interlaced byte-wise,
//     first  arg: [ x,y, ... 7 more pairs of 8-bit values ... ]
//     second arg: [ z,w, ... 7 more pairs of 8-bit values ... ]
// it returns eight 16-bit values [ x*z + y*w, ... 7 more 16-bit values ... ].
//
// So every channel byte of A0 has to sit right next to the matching byte of A1 (and B0 next
// to B1), mirroring the byte pairs the caller has already laid out in interlaced_x_weights.
//
// Returns pixel A's channels in the low four 16-bit lanes and pixel B's in the high four,
// each still scaled up by the X weights.
static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                       uint32_t B0, uint32_t B1,
                                       const __m128i& interlaced_x_weights) {
    const __m128i a0 = _mm_cvtsi32_si128(A0),
                  a1 = _mm_cvtsi32_si128(A1),
                  b0 = _mm_cvtsi32_si128(B0),
                  b1 = _mm_cvtsi32_si128(B1);

    // Byte-interleave each pair, then place A's eight bytes in the low half of the register
    // and B's eight bytes in the high half.
    const __m128i interlaced_pixels = _mm_unpacklo_epi64(_mm_unpacklo_epi8(a0, a1),
                                                         _mm_unpacklo_epi8(b0, b1));

    return _mm_maddubs_epi16(interlaced_pixels, interlaced_x_weights);
}
73
74 // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
75 // Returns two pixels, with each channel in a 16-bit lane of the __m128i.
interpolate_in_x_and_y(uint32_t A0,uint32_t A1,uint32_t A2,uint32_t A3,uint32_t B0,uint32_t B1,uint32_t B2,uint32_t B3,const __m128i & interlaced_x_weights,int wy)76 static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
77 uint32_t A2, uint32_t A3,
78 uint32_t B0, uint32_t B1,
79 uint32_t B2, uint32_t B3,
80 const __m128i& interlaced_x_weights,
81 int wy) {
82 // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
83 const __m128i wy1 = _mm_set1_epi16(wy),
84 wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);
85
86 // First interpolate in X,
87 // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
88 __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89 row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90
91 // Interpolate in Y across the two rows,
92 // then scale everything down by the maximum total weight 16x16 = 256.
93 return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
94 _mm_mullo_epi16(row1, wy1)), 8);
95 }
96
97 /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)98 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
99 const uint32_t* xy, int count, uint32_t* colors) {
100 SkASSERT(count > 0 && colors != nullptr);
101 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
102 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
103
104 int alpha = s.fAlphaScale;
105
106 // Return (px * s.fAlphaScale) / 256. (s.fAlphaScale is in [0,256].)
107 auto scale_by_alpha = [alpha](const __m128i& px) {
108 return alpha == 256 ? px
109 : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
110 };
111
112 // We're in _DX_ mode here, so we're only varying in X.
113 // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
114 // All the other entries in xy will be pairs of X coordinates and the X weight.
115 int y0, y1, wy;
116 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
117
118 auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
119 row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
120
121 while (count >= 4) {
122 // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
123 const __m128i xx = _mm_loadu_si128((const __m128i*)xy);
124
125 int x0[4],
126 x1[4];
127 __m128i wx;
128 decode_packed_coordinates_and_weight(xx, x0, x1, &wx);
129
130 // Splat out each x weight wx four times (one for each pixel channel) as wx1,
131 // and sixteen minus that as the weight for x0, wx0.
132 __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
133 wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
134
135 // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
136 __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
137 interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);
138
139 // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140 // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141 __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
142 row1[x0[0]], row1[x1[0]],
143 row0[x0[1]], row0[x1[1]],
144 row1[x0[1]], row1[x1[1]],
145 interlaced_x_weights_AB, wy);
146
147 // Once more with the other half of the x-weights for two more pixels C,D.
148 __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
149 row1[x0[2]], row1[x1[2]],
150 row0[x0[3]], row0[x1[3]],
151 row1[x0[3]], row1[x1[3]],
152 interlaced_x_weights_CD, wy);
153
154 // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155 _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
156 scale_by_alpha(CD)));
157 xy += 4;
158 colors += 4;
159 count -= 4;
160 }
161
162 while (count --> 0) {
163 // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
164 int x0, x1, wx;
165 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
166
167 // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
168 __m128i wx1 = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine.
169 wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
170
171 __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);
172
173 __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
174 row1[x0], row1[x1],
175 0, 0,
176 0, 0,
177 interlaced_x_weights_A, wy);
178
179 *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
180 }
181 }
182
183
184 #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
185
186 // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
187
188 /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)189 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
190 const uint32_t* xy, int count, uint32_t* colors) {
191 SkASSERT(count > 0 && colors != nullptr);
192 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
193 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
194 SkASSERT(s.fAlphaScale <= 256);
195
196 int y0, y1, wy;
197 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
198
199 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
200 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
201
202 // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
203 // and another in the upper 4 16-bit lanes to line up with 16 - wy.
204 const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy),
205 _mm_set1_epi16(16-wy));
206
207 while (count --> 0) {
208 int x0, x1, wx;
209 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
210
211 // Load the 4 pixels we're interpolating.
212 const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
213 a01 = _mm_cvtsi32_si128(row0[x1]),
214 a10 = _mm_cvtsi32_si128(row1[x0]),
215 a11 = _mm_cvtsi32_si128(row1[x1]);
216
217 // Line up low-x pixels a00 and a10 with allY.
218 __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
219 _mm_setzero_si128());
220
221 // Scale by allY and 16-wx.
222 a00a10 = _mm_mullo_epi16(a00a10, allY);
223 a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));
224
225
226 // Line up high-x pixels a01 and a11 with allY.
227 __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
228 _mm_setzero_si128());
229
230 // Scale by allY and wx.
231 a01a11 = _mm_mullo_epi16(a01a11, allY);
232 a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));
233
234
235 // Add the two intermediates, summing across in one direction.
236 __m128i halves = _mm_add_epi16(a00a10, a01a11);
237
238 // Add the two halves to each other to sum in the other direction.
239 __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));
240
241 // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
242 sum = _mm_srli_epi16(sum, 8);
243
244 if (s.fAlphaScale < 256) {
245 // Scale by alpha, which is in [0,256].
246 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
247 sum = _mm_srli_epi16(sum, 8);
248 }
249
250 // Pack back into 8-bit values and store.
251 *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
252 }
253 }
254
255 #else
256
257 // The NEON code only actually differs from the portable code in the
258 // filtering step after we've loaded all four pixels we want to bilerp.
259
260 #if defined(SK_ARM_HAS_NEON)
// NEON bilerp of the 2x2 neighborhood {a00, a01, a10, a11}, written to *dst.
//   x      weight of the a01/a11 samples; a00/a10 get 16-x.
//   y      weight of the a10/a11 samples; a00/a01 get 16-y.
//   scale  extra alpha scale; applied as (result * scale) / 256 only when below 256.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor *dst,
                                      uint16_t scale) {
    // Y weights as 8-bit lanes: y, and 16-y.
    const uint8x8_t wy1 = vdup_n_u8(y),
                    wy0 = vsub_u8(vmov_n_u8(16), wy1);

    // Pair up the pixels of each row: lane 0 holds the x0 sample, lane 1 the x1 sample.
    const uint32x2_t first_row  = vset_lane_u32(a01, vdup_n_u32(a00), 1),   // [a01|a00]
                     second_row = vset_lane_u32(a11, vdup_n_u32(a10), 1);   // [a11|a10]

    // Apply the Y weights, widening every channel to 16 bits.
    const uint16x8_t first_y  = vmull_u8(vreinterpret_u8_u32(first_row),  wy0),   // [a01|a00] * (16-y)
                     second_y = vmull_u8(vreinterpret_u8_u32(second_row), wy1);   // [a11|a10] * y

    // X weights as 16-bit lanes: x, and 16-x.
    const uint16x4_t wx1 = vdup_n_u16(x),
                     wx0 = vsub_u16(vmov_n_u16(16), wx1);

    // Accumulate all four weighted samples, one channel per 16-bit lane.
    uint16x4_t acc = vmul_u16(vget_high_u16(first_y), wx1);       // acc  = a01 * x
    acc = vmla_u16(acc, vget_high_u16(second_y), wx1);            // acc += a11 * x
    acc = vmla_u16(acc, vget_low_u16(first_y),   wx0);            // acc += a00 * (16-x)
    acc = vmla_u16(acc, vget_low_u16(second_y),  wx0);            // acc += a10 * (16-x)

    if (scale < 256) {
        // Drop the bilerp scaling first, then multiply by the alpha scale;
        // the final narrowing shift below removes the alpha scaling.
        acc = vshr_n_u16(acc, 8);
        acc = vmul_u16(acc, vdup_n_u16(scale));
    }

    // Shift down by 8 while narrowing to bytes, and store the low 32 bits as the pixel.
    const uint8x8_t packed = vshrn_n_u16(vcombine_u16(acc, vcreate_u16(0)), 8);
    vst1_lane_u32(dst, vreinterpret_u32_u8(packed), 0);
}
301 #else
filter_and_scale_by_alpha(unsigned x,unsigned y,SkPMColor a00,SkPMColor a01,SkPMColor a10,SkPMColor a11,SkPMColor * dstColor,unsigned alphaScale)302 static void filter_and_scale_by_alpha(unsigned x, unsigned y,
303 SkPMColor a00, SkPMColor a01,
304 SkPMColor a10, SkPMColor a11,
305 SkPMColor* dstColor,
306 unsigned alphaScale) {
307 SkASSERT((unsigned)x <= 0xF);
308 SkASSERT((unsigned)y <= 0xF);
309 SkASSERT(alphaScale <= 256);
310
311 int xy = x * y;
312 const uint32_t mask = 0xFF00FF;
313
314 int scale = 256 - 16*y - 16*x + xy;
315 uint32_t lo = (a00 & mask) * scale;
316 uint32_t hi = ((a00 >> 8) & mask) * scale;
317
318 scale = 16*x - xy;
319 lo += (a01 & mask) * scale;
320 hi += ((a01 >> 8) & mask) * scale;
321
322 scale = 16*y - xy;
323 lo += (a10 & mask) * scale;
324 hi += ((a10 >> 8) & mask) * scale;
325
326 lo += (a11 & mask) * xy;
327 hi += ((a11 >> 8) & mask) * xy;
328
329 if (alphaScale < 256) {
330 lo = ((lo >> 8) & mask) * alphaScale;
331 hi = ((hi >> 8) & mask) * alphaScale;
332 }
333
334 *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
335 }
336 #endif
337
338
339 /*not static*/ inline
S32_alpha_D32_filter_DX(const SkBitmapProcState & s,const uint32_t * xy,int count,SkPMColor * colors)340 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
341 const uint32_t* xy, int count, SkPMColor* colors) {
342 SkASSERT(count > 0 && colors != nullptr);
343 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
344 SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
345 SkASSERT(s.fAlphaScale <= 256);
346
347 int y0, y1, wy;
348 decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
349
350 auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
351 row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
352
353 while (count --> 0) {
354 int x0, x1, wx;
355 decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
356
357 filter_and_scale_by_alpha(wx, wy,
358 row0[x0], row0[x1],
359 row1[x0], row1[x1],
360 colors++,
361 s.fAlphaScale);
362 }
363 }
364
365 #endif
366
367 } // namespace SK_OPTS_NS
368
369 #endif
370