1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRow_opts_arm_neon.h"
9
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19
20 #ifdef SK_CPU_ARM64
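// The two helpers below de-interleave eight 32-bit pixels (32 bytes) into
// separate 8x8-bit channel registers with a post-incrementing ld4, advancing
// src by 8 pixels. The _3 variant only forwards the first three channels;
// the _4 variant forwards all four, including alpha.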
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22 uint8x8x4_t vsrc;
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24
25 asm (
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27 "mov %[vsrc0].8b, v0.8b \t\n"
28 "mov %[vsrc1].8b, v1.8b \t\n"
29 "mov %[vsrc2].8b, v2.8b \t\n"
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32 : : "v0", "v1", "v2", "v3"
33 );
34
35 vsrc.val[0] = vsrc_0;
36 vsrc.val[1] = vsrc_1;
37 vsrc.val[2] = vsrc_2;
38
39 return vsrc;
40 }
41
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43 uint8x8x4_t vsrc;
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45
46 asm (
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48 "mov %[vsrc0].8b, v0.8b \t\n"
49 "mov %[vsrc1].8b, v1.8b \t\n"
50 "mov %[vsrc2].8b, v2.8b \t\n"
51 "mov %[vsrc3].8b, v3.8b \t\n"
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54 [src] "+&r" (src)
55 : : "v0", "v1", "v2", "v3"
56 );
57
58 vsrc.val[0] = vsrc_0;
59 vsrc.val[1] = vsrc_1;
60 vsrc.val[2] = vsrc_2;
61 vsrc.val[3] = vsrc_3;
62
63 return vsrc;
64 }
65 #endif
66
67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68 const SkPMColor* SK_RESTRICT src, int count,
69 U8CPU alpha, int /*x*/, int /*y*/) {
70 SkASSERT(255 == alpha);
71
72 while (count >= 8) {
73 uint8x8x4_t vsrc;
74 uint16x8_t vdst;
75
76 // Load
77 #ifdef SK_CPU_ARM64
78 vsrc = sk_vld4_u8_arm64_3(src);
79 #else
80 vsrc = vld4_u8((uint8_t*)src);
81 src += 8;
82 #endif
83
84 // Convert src to 565
85 vdst = SkPixel32ToPixel16_neon8(vsrc);
86
87 // Store
88 vst1q_u16(dst, vdst);
89
90 // Prepare next iteration
91 dst += 8;
92 count -= 8;
93 }
94
95 // Leftovers
96 while (count > 0) {
97 SkPMColor c = *src++;
98 SkPMColorAssert(c);
99 *dst = SkPixel32ToPixel16_ToU16(c);
100 dst++;
101 count--;
102 }
103 }
104
105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106 const SkPMColor* SK_RESTRICT src, int count,
107 U8CPU alpha, int /*x*/, int /*y*/) {
108 SkASSERT(255 > alpha);
109
110 uint16x8_t vmask_blue, vscale;
111
112 // prepare constants
113 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114 vmask_blue = vmovq_n_u16(0x1F);
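// Per channel the loop below computes dst + (((src - dst) * scale) >> 8),
// the same SkAlphaBlend() arithmetic used by the scalar leftover loop.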
115
116 while (count >= 8) {
117 uint8x8x4_t vsrc;
118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119 uint16x8_t vres_r, vres_g, vres_b;
120
121 // Load src
122 #ifdef SK_CPU_ARM64
123 vsrc = sk_vld4_u8_arm64_3(src);
124 #else
125 {
126 register uint8x8_t d0 asm("d0");
127 register uint8x8_t d1 asm("d1");
128 register uint8x8_t d2 asm("d2");
129 register uint8x8_t d3 asm("d3");
130
131 asm (
132 "vld4.8 {d0-d3},[%[src]]!"
133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134 :
135 );
136 vsrc.val[0] = d0;
137 vsrc.val[1] = d1;
138 vsrc.val[2] = d2;
139 }
140 #endif
141
142 // Load and unpack dst
143 vdst = vld1q_u16(dst);
144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
148
149 // Shift src to 565 range
150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153
154 // Scale src - dst
155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158
159 vres_r = vshrq_n_u16(vres_r * vscale, 8);
160 vres_g = vshrq_n_u16(vres_g * vscale, 8);
161 vres_b = vshrq_n_u16(vres_b * vscale, 8);
162
163 vres_r += vdst_r;
164 vres_g += vdst_g;
165 vres_b += vdst_b;
166
167 // Combine
168 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
169 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
170
171 // Store
172 vst1q_u16(dst, vres_b);
173 dst += 8;
174 count -= 8;
175 }
176 if (count > 0) {
177 int scale = SkAlpha255To256(alpha);
178 do {
179 SkPMColor c = *src++;
180 SkPMColorAssert(c);
181 uint16_t d = *dst;
182 *dst++ = SkPackRGB16(
183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186 } while (--count != 0);
187 }
188 }
189
190 #ifdef SK_CPU_ARM32
191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192 const SkPMColor* SK_RESTRICT src, int count,
193 U8CPU alpha, int /*x*/, int /*y*/) {
194 SkASSERT(255 == alpha);
195
196 if (count >= 8) {
197 uint16_t* SK_RESTRICT keep_dst = 0;
198
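// The assembly below expands each 565 destination pixel to 8-bit channels,
// scales them by (255 - source alpha) with an approximate divide by 255,
// adds the premultiplied source channels with saturation, and repacks the
// result to 565.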
199 asm volatile (
200 "ands ip, %[count], #7 \n\t"
201 "vmov.u8 d31, #1<<7 \n\t"
202 "vld1.16 {q12}, [%[dst]] \n\t"
203 "vld4.8 {d0-d3}, [%[src]] \n\t"
204 // Thumb does not support the standard ARM conditional
205 // instructions but instead requires the 'it' instruction
206 // to signal conditional execution
207 "it eq \n\t"
208 "moveq ip, #8 \n\t"
209 "mov %[keep_dst], %[dst] \n\t"
210
211 "add %[src], %[src], ip, LSL#2 \n\t"
212 "add %[dst], %[dst], ip, LSL#1 \n\t"
213 "subs %[count], %[count], ip \n\t"
214 "b 9f \n\t"
215 // LOOP
216 "2: \n\t"
217
218 "vld1.16 {q12}, [%[dst]]! \n\t"
219 "vld4.8 {d0-d3}, [%[src]]! \n\t"
220 "vst1.16 {q10}, [%[keep_dst]] \n\t"
221 "sub %[keep_dst], %[dst], #8*2 \n\t"
222 "subs %[count], %[count], #8 \n\t"
223 "9: \n\t"
224 "pld [%[dst],#32] \n\t"
225 // expand 0565 q12 to 8888 {d4-d7}
226 "vmovn.u16 d4, q12 \n\t"
227 "vshr.u16 q11, q12, #5 \n\t"
228 "vshr.u16 q10, q12, #6+5 \n\t"
229 "vmovn.u16 d5, q11 \n\t"
230 "vmovn.u16 d6, q10 \n\t"
231 "vshl.u8 d4, d4, #3 \n\t"
232 "vshl.u8 d5, d5, #2 \n\t"
233 "vshl.u8 d6, d6, #3 \n\t"
234
235 "vmovl.u8 q14, d31 \n\t"
236 "vmovl.u8 q13, d31 \n\t"
237 "vmovl.u8 q12, d31 \n\t"
238
239 // duplicate in 4/2/1 & 8pix vsns
240 "vmvn.8 d30, d3 \n\t"
241 "vmlal.u8 q14, d30, d6 \n\t"
242 "vmlal.u8 q13, d30, d5 \n\t"
243 "vmlal.u8 q12, d30, d4 \n\t"
244 "vshr.u16 q8, q14, #5 \n\t"
245 "vshr.u16 q9, q13, #6 \n\t"
246 "vaddhn.u16 d6, q14, q8 \n\t"
247 "vshr.u16 q8, q12, #5 \n\t"
248 "vaddhn.u16 d5, q13, q9 \n\t"
249 "vqadd.u8 d6, d6, d0 \n\t" // moved up
250 "vaddhn.u16 d4, q12, q8 \n\t"
251 // intentionally don't calculate alpha
252 // result in d4-d6
253
254 "vqadd.u8 d5, d5, d1 \n\t"
255 "vqadd.u8 d4, d4, d2 \n\t"
256
257 // pack 8888 {d4-d6} to 0565 q10
258 "vshll.u8 q10, d6, #8 \n\t"
259 "vshll.u8 q3, d5, #8 \n\t"
260 "vshll.u8 q2, d4, #8 \n\t"
261 "vsri.u16 q10, q3, #5 \n\t"
262 "vsri.u16 q10, q2, #11 \n\t"
263
264 "bne 2b \n\t"
265
266 "1: \n\t"
267 "vst1.16 {q10}, [%[keep_dst]] \n\t"
268 : [count] "+r" (count)
269 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
272 "d30","d31"
273 );
274 }
275 else
276 { // handle count < 8
277 uint16_t* SK_RESTRICT keep_dst = 0;
278
279 asm volatile (
280 "vmov.u8 d31, #1<<7 \n\t"
281 "mov %[keep_dst], %[dst] \n\t"
282
283 "tst %[count], #4 \n\t"
284 "beq 14f \n\t"
285 "vld1.16 {d25}, [%[dst]]! \n\t"
286 "vld1.32 {q1}, [%[src]]! \n\t"
287
288 "14: \n\t"
289 "tst %[count], #2 \n\t"
290 "beq 12f \n\t"
291 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
292 "vld1.32 {d1}, [%[src]]! \n\t"
293
294 "12: \n\t"
295 "tst %[count], #1 \n\t"
296 "beq 11f \n\t"
297 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
298 "vld1.32 {d0[1]}, [%[src]]! \n\t"
299
300 "11: \n\t"
301 // unzips achieve the same as a vld4 operation
302 "vuzpq.u16 q0, q1 \n\t"
303 "vuzp.u8 d0, d1 \n\t"
304 "vuzp.u8 d2, d3 \n\t"
305 // expand 0565 q12 to 8888 {d4-d7}
306 "vmovn.u16 d4, q12 \n\t"
307 "vshr.u16 q11, q12, #5 \n\t"
308 "vshr.u16 q10, q12, #6+5 \n\t"
309 "vmovn.u16 d5, q11 \n\t"
310 "vmovn.u16 d6, q10 \n\t"
311 "vshl.u8 d4, d4, #3 \n\t"
312 "vshl.u8 d5, d5, #2 \n\t"
313 "vshl.u8 d6, d6, #3 \n\t"
314
315 "vmovl.u8 q14, d31 \n\t"
316 "vmovl.u8 q13, d31 \n\t"
317 "vmovl.u8 q12, d31 \n\t"
318
319 // duplicate in 4/2/1 & 8pix vsns
320 "vmvn.8 d30, d3 \n\t"
321 "vmlal.u8 q14, d30, d6 \n\t"
322 "vmlal.u8 q13, d30, d5 \n\t"
323 "vmlal.u8 q12, d30, d4 \n\t"
324 "vshr.u16 q8, q14, #5 \n\t"
325 "vshr.u16 q9, q13, #6 \n\t"
326 "vaddhn.u16 d6, q14, q8 \n\t"
327 "vshr.u16 q8, q12, #5 \n\t"
328 "vaddhn.u16 d5, q13, q9 \n\t"
329 "vqadd.u8 d6, d6, d0 \n\t" // moved up
330 "vaddhn.u16 d4, q12, q8 \n\t"
331 // intentionally don't calculate alpha
332 // result in d4-d6
333
334 "vqadd.u8 d5, d5, d1 \n\t"
335 "vqadd.u8 d4, d4, d2 \n\t"
336
337 // pack 8888 {d4-d6} to 0565 q10
338 "vshll.u8 q10, d6, #8 \n\t"
339 "vshll.u8 q3, d5, #8 \n\t"
340 "vshll.u8 q2, d4, #8 \n\t"
341 "vsri.u16 q10, q3, #5 \n\t"
342 "vsri.u16 q10, q2, #11 \n\t"
343
344 // store
345 "tst %[count], #4 \n\t"
346 "beq 24f \n\t"
347 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
348
349 "24: \n\t"
350 "tst %[count], #2 \n\t"
351 "beq 22f \n\t"
352 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
353
354 "22: \n\t"
355 "tst %[count], #1 \n\t"
356 "beq 21f \n\t"
357 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
358
359 "21: \n\t"
360 : [count] "+r" (count)
361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
364 "d30","d31"
365 );
366 }
367 }
368 #endif
369
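// Rounding division by 255 for eight 16-bit lanes at once:
// (x + 128 + ((x + 128) >> 8)) >> 8, which equals SkDiv255Round(x) for the
// products of two 8-bit values fed to it in this file.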
370 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
371 prod += vdupq_n_u16(128);
372 prod += vshrq_n_u16(prod, 8);
373 return vshrq_n_u16(prod, 8);
374 }
375
376 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
377 const SkPMColor* SK_RESTRICT src, int count,
378 U8CPU alpha, int /*x*/, int /*y*/) {
379 SkASSERT(255 > alpha);
380
381 /* This code implements a Neon version of S32A_D565_Blend. The results have
382 * a few mismatches compared to the original code. These mismatches never
383 * exceed 1.
384 */
385
386 if (count >= 8) {
387 uint16x8_t valpha_max, vmask_blue;
388 uint8x8_t valpha;
389
390 // prepare constants
391 valpha_max = vmovq_n_u16(255);
392 valpha = vdup_n_u8(alpha);
393 vmask_blue = vmovq_n_u16(SK_B16_MASK);
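// Per channel the loop below computes, at 565 precision,
// (src * alpha + dst * dst_scale) with dst_scale = 255 - SkDiv255Round(sa * alpha),
// then divides by 255 -- exactly via SkDiv255Round_neon8 when
// S32A_D565_BLEND_EXACT is defined, otherwise approximated by a rounding
// shift right by 8 (hence the at-most-1 mismatches noted above).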
394
395 do {
396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
397 uint16x8_t vres_a, vres_r, vres_g, vres_b;
398 uint8x8x4_t vsrc;
399
400 // load pixels
401 vdst = vld1q_u16(dst);
402 #ifdef SK_CPU_ARM64
403 vsrc = sk_vld4_u8_arm64_4(src);
404 #else
405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
406 asm (
407 "vld4.u8 %h[vsrc], [%[src]]!"
408 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
409 : :
410 );
411 #else
412 register uint8x8_t d0 asm("d0");
413 register uint8x8_t d1 asm("d1");
414 register uint8x8_t d2 asm("d2");
415 register uint8x8_t d3 asm("d3");
416
417 asm volatile (
418 "vld4.u8 {d0-d3},[%[src]]!;"
419 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
420 [src] "+&r" (src)
421 : :
422 );
423 vsrc.val[0] = d0;
424 vsrc.val[1] = d1;
425 vsrc.val[2] = d2;
426 vsrc.val[3] = d3;
427 #endif
428 #endif // #ifdef SK_CPU_ARM64
429
430
431 // deinterleave dst
432 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
433 vdst_b = vdst & vmask_blue; // extract blue
434 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
435 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
436
437 // shift src to 565
438 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
439 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
440 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
441
442 // calc src * src_scale
443 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
444 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
445 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
446 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
447
448 // prepare dst_scale
449 vres_a = SkDiv255Round_neon8(vres_a);
450 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
451
452 // add dst * dst_scale to previous result
453 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
454 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
455 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
456
457 #ifdef S32A_D565_BLEND_EXACT
458 // It is possible to get exact results with this but it is slow,
459 // even slower than C code in some cases
460 vres_r = SkDiv255Round_neon8(vres_r);
461 vres_g = SkDiv255Round_neon8(vres_g);
462 vres_b = SkDiv255Round_neon8(vres_b);
463 #else
464 vres_r = vrshrq_n_u16(vres_r, 8);
465 vres_g = vrshrq_n_u16(vres_g, 8);
466 vres_b = vrshrq_n_u16(vres_b, 8);
467 #endif
468 // pack result
469 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
470 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
471
472 // store
473 vst1q_u16(dst, vres_b);
474 dst += 8;
475 count -= 8;
476 } while (count >= 8);
477 }
478
479 // leftovers
480 while (count-- > 0) {
481 SkPMColor sc = *src++;
482 if (sc) {
483 uint16_t dc = *dst;
484 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
485 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
486 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
487 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
488 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
489 }
490 dst += 1;
491 }
492 }
493
494 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
495 * each dither value is spaced out into byte lanes, and repeated
496 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
497 * start of each row.
498 */
499 static const uint8_t gDitherMatrix_Neon[48] = {
500 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
501 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
502 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
503 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
504
505 };
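// Example: with (y & 3) == 0 and (x & 3) == 2, the 8-byte load starts at
// offset 2 of the first row and picks up the dither values 1,5,0,4,1,5,0,4.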
506
507 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
508 int count, U8CPU alpha, int x, int y)
509 {
510
511 SkASSERT(255 > alpha);
512
513 // rescale alpha to range 1 - 256
514 int scale = SkAlpha255To256(alpha);
515
516 if (count >= 8) {
517 /* select row and offset for dither array */
518 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
519
520 uint8x8_t vdither = vld1_u8(dstart); // load dither values
521 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
522
523 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
524 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
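// The loop below mirrors the scalar path: add the dither (halved for green)
// to each 8-bit source channel, subtract the channel's top bits (the
// SkDITHER_*32To565 step), narrow to 565, then blend toward dst with
// dst + (((src565 - dst) * scale) >> 8).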
525
526 do {
527
528 uint8x8x4_t vsrc;
529 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
530 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
531 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
532 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
533 uint16x8_t vdst;
534 uint16x8_t vdst_r, vdst_g, vdst_b;
535 int16x8_t vres_r, vres_g, vres_b;
536 int8x8_t vres8_r, vres8_g, vres8_b;
537
538 // Load source and add dither
539 #ifdef SK_CPU_ARM64
540 vsrc = sk_vld4_u8_arm64_3(src);
541 #else
542 {
543 register uint8x8_t d0 asm("d0");
544 register uint8x8_t d1 asm("d1");
545 register uint8x8_t d2 asm("d2");
546 register uint8x8_t d3 asm("d3");
547
548 asm (
549 "vld4.8 {d0-d3},[%[src]]! "
550 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
551 :
552 );
553 vsrc.val[0] = d0;
554 vsrc.val[1] = d1;
555 vsrc.val[2] = d2;
556 }
557 #endif
558 vsrc_r = vsrc.val[NEON_R];
559 vsrc_g = vsrc.val[NEON_G];
560 vsrc_b = vsrc.val[NEON_B];
561
562 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
563 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
564 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
565
566 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
567 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
568 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
569
570 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
571 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
572 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
573
574 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
575 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
576 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
577
578 // Load dst and unpack
579 vdst = vld1q_u16(dst);
580 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
581 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
582 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
583
584 // subtract dst from src and widen
585 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
586 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
587 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
588
589 // multiply diffs by scale and shift
590 vres_r = vmulq_s16(vres_r, vscale);
591 vres_g = vmulq_s16(vres_g, vscale);
592 vres_b = vmulq_s16(vres_b, vscale);
593
594 vres8_r = vshrn_n_s16(vres_r, 8);
595 vres8_g = vshrn_n_s16(vres_g, 8);
596 vres8_b = vshrn_n_s16(vres_b, 8);
597
598 // add dst to result
599 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
600 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
601 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
602
603 // put result into 565 format
604 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
605 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
606
607 // Store result
608 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
609
610 // Next iteration
611 dst += 8;
612 count -= 8;
613
614 } while (count >= 8);
615 }
616
617 // Leftovers
618 if (count > 0) {
619 int scale = SkAlpha255To256(alpha);
620 DITHER_565_SCAN(y);
621 do {
622 SkPMColor c = *src++;
623 SkPMColorAssert(c);
624
625 int dither = DITHER_VALUE(x);
626 int sr = SkGetPackedR32(c);
627 int sg = SkGetPackedG32(c);
628 int sb = SkGetPackedB32(c);
629 sr = SkDITHER_R32To565(sr, dither);
630 sg = SkDITHER_G32To565(sg, dither);
631 sb = SkDITHER_B32To565(sb, dither);
632
633 uint16_t d = *dst;
634 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
635 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
636 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
637 DITHER_INC_X(x);
638 } while (--count != 0);
639 }
640 }
641
642 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
643 const SkPMColor* SK_RESTRICT src,
644 int count, U8CPU alpha) {
645
646 SkASSERT(255 == alpha);
647 if (count > 0) {
648
649
650 uint8x8_t alpha_mask;
651
652 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
653 alpha_mask = vld1_u8(alpha_mask_setup);
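/* vtbl1_u8 with this index pattern copies byte 3 of each of the two loaded
 * pixels (the most significant byte of a little-endian SkPMColor load) into
 * all four of that pixel's lanes, broadcasting the alpha across its channels.
 */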
654
655 /* do the NEON unrolled code */
656 #define UNROLL 4
657 while (count >= UNROLL) {
658 uint8x8_t src_raw, dst_raw, dst_final;
659 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
660
661 /* The two prefetches below may make the code slightly
662 * slower for small values of count but are worth having
663 * in the general case.
664 */
665 __builtin_prefetch(src+32);
666 __builtin_prefetch(dst+32);
667
668 /* get the source */
669 src_raw = vreinterpret_u8_u32(vld1_u32(src));
670 #if UNROLL > 2
671 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
672 #endif
673
674 /* get and hold the dst too */
675 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
676 #if UNROLL > 2
677 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
678 #endif
679
680 /* 1st and 2nd bits of the unrolling */
681 {
682 uint8x8_t dst_cooked;
683 uint16x8_t dst_wide;
684 uint8x8_t alpha_narrow;
685 uint16x8_t alpha_wide;
686
687 /* get the alphas spread out properly */
688 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
689 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
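// 256 - alpha == SkAlpha255To256(255 - alpha); the +1 of the scalar
// SkPMSrcOver math is folded into the 256 constant.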
690
691 /* spread the dest */
692 dst_wide = vmovl_u8(dst_raw);
693
694 /* alpha mul the dest */
695 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
696 dst_cooked = vshrn_n_u16(dst_wide, 8);
697
698 /* sum -- ignoring any byte lane overflows */
699 dst_final = vadd_u8(src_raw, dst_cooked);
700 }
701
702 #if UNROLL > 2
703 /* the 3rd and 4th bits of our unrolling */
704 {
705 uint8x8_t dst_cooked;
706 uint16x8_t dst_wide;
707 uint8x8_t alpha_narrow;
708 uint16x8_t alpha_wide;
709
710 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
711 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
712
713 /* spread the dest */
714 dst_wide = vmovl_u8(dst_raw_2);
715
716 /* alpha mul the dest */
717 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
718 dst_cooked = vshrn_n_u16(dst_wide, 8);
719
720 /* sum -- ignoring any byte lane overflows */
721 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
722 }
723 #endif
724
725 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
726 #if UNROLL > 2
727 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
728 #endif
729
730 src += UNROLL;
731 dst += UNROLL;
732 count -= UNROLL;
733 }
734 #undef UNROLL
735
736 /* do any residual iterations */
737 while (--count >= 0) {
738 *dst = SkPMSrcOver(*src, *dst);
739 src += 1;
740 dst += 1;
741 }
742 }
743 }
744
745 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
746 const SkPMColor* SK_RESTRICT src,
747 int count, U8CPU alpha) {
748 SkASSERT(255 == alpha);
749
750 if (count <= 0)
751 return;
752
753 /* Use these to check if src is transparent or opaque */
754 const unsigned int ALPHA_OPAQ = 0xFF000000;
755 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
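/* The code below is a small state machine over runs of source pixels:
 * fully transparent runs are skipped (ALPHA_0), fully opaque runs are
 * copied straight to dst (ALPHA_255), and everything else takes the full
 * NEON blend path (ALPHA_1_TO_254), with a scalar TAIL for the leftovers.
 */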
756
757 #define UNROLL 4
758 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
759 const SkPMColor* SK_RESTRICT src_temp = src;
760
761 /* set up the NEON variables */
762 uint8x8_t alpha_mask;
763 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
764 alpha_mask = vld1_u8(alpha_mask_setup);
765
766 uint8x8_t src_raw, dst_raw, dst_final;
767 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
768 uint8x8_t dst_cooked;
769 uint16x8_t dst_wide;
770 uint8x8_t alpha_narrow;
771 uint16x8_t alpha_wide;
772
773 /* choose the first processing type */
774 if( src >= src_end)
775 goto TAIL;
776 if(*src <= ALPHA_TRANS)
777 goto ALPHA_0;
778 if(*src >= ALPHA_OPAQ)
779 goto ALPHA_255;
780 /* fall-thru */
781
782 ALPHA_1_TO_254:
783 do {
784
785 /* get the source */
786 src_raw = vreinterpret_u8_u32(vld1_u32(src));
787 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
788
789 /* get and hold the dst too */
790 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
791 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
792
793
794 /* get the alphas spread out properly */
795 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
796 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
797 /* we collapsed (255-a)+1 ... */
798 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
799
800 /* spread the dest */
801 dst_wide = vmovl_u8(dst_raw);
802
803 /* alpha mul the dest */
804 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
805 dst_cooked = vshrn_n_u16(dst_wide, 8);
806
807 /* sum -- ignoring any byte lane overflows */
808 dst_final = vadd_u8(src_raw, dst_cooked);
809
810 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
811 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
812 /* we collapsed (255-a)+1 ... */
813 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
814
815 /* spread the dest */
816 dst_wide = vmovl_u8(dst_raw_2);
817
818 /* alpha mul the dest */
819 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
820 dst_cooked = vshrn_n_u16(dst_wide, 8);
821
822 /* sum -- ignoring any byte lane overflows */
823 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
824
825 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
826 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
827
828 src += UNROLL;
829 dst += UNROLL;
830
831 /* if 2 of the next pixels aren't between 1 and 254
832 it might make sense to go to the optimized loops */
833 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
834 break;
835
836 } while(src < src_end);
837
838 if (src >= src_end)
839 goto TAIL;
840
841 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
842 goto ALPHA_255;
843
844 /*fall-thru*/
845
846 ALPHA_0:
847
848 /* In this state, we know the current alpha is 0 and
849 we optimize for the next alpha also being zero. */
850 src_temp = src; //so we don't have to increment dst every time
851 do {
852 if(*(++src) > ALPHA_TRANS)
853 break;
854 if(*(++src) > ALPHA_TRANS)
855 break;
856 if(*(++src) > ALPHA_TRANS)
857 break;
858 if(*(++src) > ALPHA_TRANS)
859 break;
860 } while(src < src_end);
861
862 dst += (src - src_temp);
863
864 /* no longer alpha 0, so determine where to go next. */
865 if( src >= src_end)
866 goto TAIL;
867 if(*src >= ALPHA_OPAQ)
868 goto ALPHA_255;
869 else
870 goto ALPHA_1_TO_254;
871
872 ALPHA_255:
873 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
874 dst[0]=src[0];
875 dst[1]=src[1];
876 dst[2]=src[2];
877 dst[3]=src[3];
878 src+=UNROLL;
879 dst+=UNROLL;
880 if(src >= src_end)
881 goto TAIL;
882 }
883
884 //Handle remainder.
885 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
886 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
887 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
888 }
889 }
890
891 if( src >= src_end)
892 goto TAIL;
893 if(*src <= ALPHA_TRANS)
894 goto ALPHA_0;
895 else
896 goto ALPHA_1_TO_254;
897
898 TAIL:
899 /* do any residual iterations */
900 src_end += UNROLL + 1; //goto the real end
901 while(src != src_end) {
902 if( *src != 0 ) {
903 if( *src >= ALPHA_OPAQ ) {
904 *dst = *src;
905 }
906 else {
907 *dst = SkPMSrcOver(*src, *dst);
908 }
909 }
910 src++;
911 dst++;
912 }
913
914 #undef UNROLL
915 return;
916 }
917
918 /* Neon version of S32_Blend_BlitRow32()
919 * portable version is in src/core/SkBlitRow_D32.cpp
920 */
921 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
922 const SkPMColor* SK_RESTRICT src,
923 int count, U8CPU alpha) {
924 SkASSERT(alpha <= 255);
925
926 if (count <= 0) {
927 return;
928 }
929
930 uint16_t src_scale = SkAlpha255To256(alpha);
931 uint16_t dst_scale = 256 - src_scale;
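// Each channel becomes ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8),
// mirroring the portable S32_Blend_BlitRow32 referenced above.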
932
933 while (count >= 2) {
934 uint8x8_t vsrc, vdst, vres;
935 uint16x8_t vsrc_wide, vdst_wide;
936
937 /* These commented prefetches are a big win for count
938 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
939 * They also hurt a little (<5%) on an A15
940 */
941 //__builtin_prefetch(src+32);
942 //__builtin_prefetch(dst+32);
943
944 // Load
945 vsrc = vreinterpret_u8_u32(vld1_u32(src));
946 vdst = vreinterpret_u8_u32(vld1_u32(dst));
947
948 // Process src
949 vsrc_wide = vmovl_u8(vsrc);
950 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
951
952 // Process dst
953 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
954
955 // Combine
956 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
957
958 // Store
959 vst1_u32(dst, vreinterpret_u32_u8(vres));
960
961 src += 2;
962 dst += 2;
963 count -= 2;
964 }
965
966 if (count == 1) {
967 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
968 uint16x8_t vsrc_wide, vdst_wide;
969
970 // Load
971 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
972 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
973
974 // Process
975 vsrc_wide = vmovl_u8(vsrc);
976 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
977 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
978 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
979
980 // Store
981 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
982 }
983 }
984
985 #ifdef SK_CPU_ARM32
986 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
987 const SkPMColor* SK_RESTRICT src,
988 int count, U8CPU alpha) {
989
990 SkASSERT(255 >= alpha);
991
992 if (count <= 0) {
993 return;
994 }
995
996 unsigned alpha256 = SkAlpha255To256(alpha);
997
998 // First deal with odd counts
999 if (count & 1) {
1000 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1001 uint16x8_t vdst_wide, vsrc_wide;
1002 unsigned dst_scale;
1003
1004 // Load
1005 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1006 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1007
1008 // Calc dst_scale
1009 dst_scale = vget_lane_u8(vsrc, 3);
1010 dst_scale *= alpha256;
1011 dst_scale >>= 8;
1012 dst_scale = 256 - dst_scale;
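// dst_scale = 256 - ((src_alpha * alpha256) >> 8): the weight left for the
// destination once the source has been scaled by the global alpha
// (lane 3 of vsrc is the source alpha byte in the usual SkPMColor layout).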
1013
1014 // Process src
1015 vsrc_wide = vmovl_u8(vsrc);
1016 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1017
1018 // Process dst
1019 vdst_wide = vmovl_u8(vdst);
1020 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1021
1022 // Combine
1023 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1024
1025 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1026 dst++;
1027 src++;
1028 count--;
1029 }
1030
1031 if (count) {
1032 uint8x8_t alpha_mask;
1033 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1034 alpha_mask = vld1_u8(alpha_mask_setup);
1035
1036 do {
1037
1038 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1039 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1040
1041 __builtin_prefetch(src+32);
1042 __builtin_prefetch(dst+32);
1043
1044 // Load
1045 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1046 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1047
1048 // Prepare src_scale
1049 vsrc_scale = vdupq_n_u16(alpha256);
1050
1051 // Calc dst_scale
1052 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1053 vdst_scale = vmovl_u8(vsrc_alphas);
1054 vdst_scale *= vsrc_scale;
1055 vdst_scale = vshrq_n_u16(vdst_scale, 8);
1056 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1057
1058 // Process src
1059 vsrc_wide = vmovl_u8(vsrc);
1060 vsrc_wide *= vsrc_scale;
1061
1062 // Process dst
1063 vdst_wide = vmovl_u8(vdst);
1064 vdst_wide *= vdst_scale;
1065
1066 // Combine
1067 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1068
1069 vst1_u32(dst, vreinterpret_u32_u8(vres));
1070
1071 src += 2;
1072 dst += 2;
1073 count -= 2;
1074 } while(count);
1075 }
1076 }
1077
1078 ///////////////////////////////////////////////////////////////////////////////
1079
1080 #undef DEBUG_OPAQUE_DITHER
1081
1082 #if defined(DEBUG_OPAQUE_DITHER)
1083 static void showme8(char *str, void *p, int len)
1084 {
1085 static char buf[256];
1086 char tbuf[32];
1087 int i;
1088 char *pc = (char*) p;
1089 sprintf(buf,"%8s:", str);
1090 for(i=0;i<len;i++) {
1091 sprintf(tbuf, " %02x", pc[i]);
1092 strcat(buf, tbuf);
1093 }
1094 SkDebugf("%s\n", buf);
1095 }
1096 static void showme16(char *str, void *p, int len)
1097 {
1098 static char buf[256];
1099 char tbuf[32];
1100 int i;
1101 uint16_t *pc = (uint16_t*) p;
1102 sprintf(buf,"%8s:", str);
1103 len = (len / sizeof(uint16_t)); /* passed as bytes */
1104 for(i=0;i<len;i++) {
1105 sprintf(tbuf, " %04x", pc[i]);
1106 strcat(buf, tbuf);
1107 }
1108 SkDebugf("%s\n", buf);
1109 }
1110 #endif
1111 #endif // #ifdef SK_CPU_ARM32
1112
1113 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1114 const SkPMColor* SK_RESTRICT src,
1115 int count, U8CPU alpha, int x, int y) {
1116 SkASSERT(255 == alpha);
1117
1118 #define UNROLL 8
1119
1120 if (count >= UNROLL) {
1121
1122 #if defined(DEBUG_OPAQUE_DITHER)
1123 uint16_t tmpbuf[UNROLL];
1124 int td[UNROLL];
1125 int tdv[UNROLL];
1126 int ta[UNROLL];
1127 int tap[UNROLL];
1128 uint16_t in_dst[UNROLL];
1129 int offset = 0;
1130 int noisy = 0;
1131 #endif
1132
1133 uint8x8_t dbase;
1134 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1135 dbase = vld1_u8(dstart);
1136
1137 do {
1138 uint8x8x4_t vsrc;
1139 uint8x8_t sr, sg, sb, sa, d;
1140 uint16x8_t dst8, scale8, alpha8;
1141 uint16x8_t dst_r, dst_g, dst_b;
1142
1143 #if defined(DEBUG_OPAQUE_DITHER)
1144 // calculate 8 elements worth into a temp buffer
1145 {
1146 int my_y = y;
1147 int my_x = x;
1148 SkPMColor* my_src = (SkPMColor*)src;
1149 uint16_t* my_dst = dst;
1150 int i;
1151
1152 DITHER_565_SCAN(my_y);
1153 for(i = 0; i < UNROLL; i++) {
1154 SkPMColor c = *my_src++;
1155 SkPMColorAssert(c);
1156 if (c) {
1157 unsigned a = SkGetPackedA32(c);
1158
1159 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1160 tdv[i] = DITHER_VALUE(my_x);
1161 ta[i] = a;
1162 tap[i] = SkAlpha255To256(a);
1163 td[i] = d;
1164
1165 unsigned sr = SkGetPackedR32(c);
1166 unsigned sg = SkGetPackedG32(c);
1167 unsigned sb = SkGetPackedB32(c);
1168 sr = SkDITHER_R32_FOR_565(sr, d);
1169 sg = SkDITHER_G32_FOR_565(sg, d);
1170 sb = SkDITHER_B32_FOR_565(sb, d);
1171
1172 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1173 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1174 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1175 // now src and dst expanded are in g:11 r:10 x:1 b:10
1176 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1177 td[i] = d;
1178 } else {
1179 tmpbuf[i] = *my_dst;
1180 ta[i] = tdv[i] = td[i] = 0xbeef;
1181 }
1182 in_dst[i] = *my_dst;
1183 my_dst += 1;
1184 DITHER_INC_X(my_x);
1185 }
1186 }
1187 #endif
1188
1189 #ifdef SK_CPU_ARM64
1190 vsrc = sk_vld4_u8_arm64_4(src);
1191 #else
1192 {
1193 register uint8x8_t d0 asm("d0");
1194 register uint8x8_t d1 asm("d1");
1195 register uint8x8_t d2 asm("d2");
1196 register uint8x8_t d3 asm("d3");
1197
1198 asm ("vld4.8 {d0-d3},[%[src]]! "
1199 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1200 :
1201 );
1202 vsrc.val[0] = d0;
1203 vsrc.val[1] = d1;
1204 vsrc.val[2] = d2;
1205 vsrc.val[3] = d3;
1206 }
1207 #endif
1208 sa = vsrc.val[NEON_A];
1209 sr = vsrc.val[NEON_R];
1210 sg = vsrc.val[NEON_G];
1211 sb = vsrc.val[NEON_B];
1212
1213 /* calculate 'd', which will be 0..7
1214 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1215 */
1216 alpha8 = vmovl_u8(dbase);
1217 alpha8 = vmlal_u8(alpha8, sa, dbase);
1218 d = vshrn_n_u16(alpha8, 8); // narrowing too
1219
1220 // sr = sr - (sr>>5) + d
1221 /* watching for 8-bit overflow. d is 0..7; risky range of
1222 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1223 * safe as long as we do ((sr-sr>>5) + d)
1224 */
1225 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1226 sr = vadd_u8(sr, d);
1227
1228 // sb = sb - (sb>>5) + d
1229 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1230 sb = vadd_u8(sb, d);
1231
1232 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1233 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1234 sg = vadd_u8(sg, vshr_n_u8(d,1));
1235
1236 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1237 dst8 = vld1q_u16(dst);
1238 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1239 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1240 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
1241
1242 // blend
1243 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1244
1245 // combine the addq and mul, save 3 insns
1246 scale8 = vshrq_n_u16(scale8, 3);
1247 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1248 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1249 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
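// Each accumulator now holds the blended channel times 32 (5 fractional
// bits): (sx << 2 or 3) >> 5 is the dithered 565 source channel and
// (dst_x * ((256 - sa) >> 3)) >> 5 approximates dst_x * (256 - sa) / 256,
// so the single >> 5 per channel in the repack below finishes the blend.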
1250
1251 // repack to store
1252 dst8 = vshrq_n_u16(dst_b, 5);
1253 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1254 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1255
1256 vst1q_u16(dst, dst8);
1257
1258 #if defined(DEBUG_OPAQUE_DITHER)
1259 // verify my 8 elements match the temp buffer
1260 {
1261 int i, bad=0;
1262 static int invocation;
1263
1264 for (i = 0; i < UNROLL; i++) {
1265 if (tmpbuf[i] != dst[i]) {
1266 bad=1;
1267 }
1268 }
1269 if (bad) {
1270 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1271 invocation, offset);
1272 SkDebugf(" alpha 0x%x\n", alpha);
1273 for (i = 0; i < UNROLL; i++)
1274 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1275 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1276 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1277
1278 showme16("alpha8", &alpha8, sizeof(alpha8));
1279 showme16("scale8", &scale8, sizeof(scale8));
1280 showme8("d", &d, sizeof(d));
1281 showme16("dst8", &dst8, sizeof(dst8));
1282 showme16("dst_b", &dst_b, sizeof(dst_b));
1283 showme16("dst_g", &dst_g, sizeof(dst_g));
1284 showme16("dst_r", &dst_r, sizeof(dst_r));
1285 showme8("sb", &sb, sizeof(sb));
1286 showme8("sg", &sg, sizeof(sg));
1287 showme8("sr", &sr, sizeof(sr));
1288
1289 return;
1290 }
1291 offset += UNROLL;
1292 invocation++;
1293 }
1294 #endif
1295 dst += UNROLL;
1296 count -= UNROLL;
1297 // skip x += UNROLL, since it's unchanged mod-4
1298 } while (count >= UNROLL);
1299 }
1300 #undef UNROLL
1301
1302 // residuals
1303 if (count > 0) {
1304 DITHER_565_SCAN(y);
1305 do {
1306 SkPMColor c = *src++;
1307 SkPMColorAssert(c);
1308 if (c) {
1309 unsigned a = SkGetPackedA32(c);
1310
1311 // dither and alpha are just temporary variables used to work around
1312 // an ICE in debug builds.
1313 unsigned dither = DITHER_VALUE(x);
1314 unsigned alpha = SkAlpha255To256(a);
1315 int d = SkAlphaMul(dither, alpha);
1316
1317 unsigned sr = SkGetPackedR32(c);
1318 unsigned sg = SkGetPackedG32(c);
1319 unsigned sb = SkGetPackedB32(c);
1320 sr = SkDITHER_R32_FOR_565(sr, d);
1321 sg = SkDITHER_G32_FOR_565(sg, d);
1322 sb = SkDITHER_B32_FOR_565(sb, d);
1323
1324 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1325 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1326 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1327 // now src and dst expanded are in g:11 r:10 x:1 b:10
1328 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1329 }
1330 dst += 1;
1331 DITHER_INC_X(x);
1332 } while (--count != 0);
1333 }
1334 }
1335
1336 ///////////////////////////////////////////////////////////////////////////////
1337
1338 #undef DEBUG_S32_OPAQUE_DITHER
1339
1340 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1341 const SkPMColor* SK_RESTRICT src,
1342 int count, U8CPU alpha, int x, int y) {
1343 SkASSERT(255 == alpha);
1344
1345 #define UNROLL 8
1346 if (count >= UNROLL) {
1347 uint8x8_t d;
1348 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1349 d = vld1_u8(dstart);
1350
1351 while (count >= UNROLL) {
1352 uint8x8_t sr, sg, sb;
1353 uint16x8_t dr, dg, db;
1354 uint16x8_t dst8;
1355 uint8x8x4_t vsrc;
1356
1357 #ifdef SK_CPU_ARM64
1358 vsrc = sk_vld4_u8_arm64_3(src);
1359 #else
1360 {
1361 register uint8x8_t d0 asm("d0");
1362 register uint8x8_t d1 asm("d1");
1363 register uint8x8_t d2 asm("d2");
1364 register uint8x8_t d3 asm("d3");
1365
1366 asm (
1367 "vld4.8 {d0-d3},[%[src]]! "
1368 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1369 :
1370 );
1371 vsrc.val[0] = d0;
1372 vsrc.val[1] = d1;
1373 vsrc.val[2] = d2;
1374 }
1375 #endif
1376 sr = vsrc.val[NEON_R];
1377 sg = vsrc.val[NEON_G];
1378 sb = vsrc.val[NEON_B];
1379
1380 /* XXX: if we want to prefetch, hide it in the above asm()
1381 * using the gcc __builtin_prefetch(); the prefetch will
1382 * fall to the bottom of the loop -- it won't stick up
1383 * at the top of the loop, just after the vld4.
1384 */
1385
1386 // sr = sr - (sr>>5) + d
1387 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1388 dr = vaddl_u8(sr, d);
1389
1390 // sb = sb - (sb>>5) + d
1391 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1392 db = vaddl_u8(sb, d);
1393
1394 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1395 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1396 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1397
1398 // pack high bits of each into 565 format (rgb, b is lsb)
1399 dst8 = vshrq_n_u16(db, 3);
1400 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1401 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1402
1403 // store it
1404 vst1q_u16(dst, dst8);
1405
1406 #if defined(DEBUG_S32_OPAQUE_DITHER)
1407 // always good to know if we generated good results
1408 {
1409 int i, myx = x, myy = y;
1410 DITHER_565_SCAN(myy);
1411 for (i=0;i<UNROLL;i++) {
1412 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1413 SkPMColor c = src[i-8];
1414 unsigned dither = DITHER_VALUE(myx);
1415 uint16_t val = SkDitherRGB32To565(c, dither);
1416 if (val != dst[i]) {
1417 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1418 c, dither, val, dst[i], dstart[i]);
1419 }
1420 DITHER_INC_X(myx);
1421 }
1422 }
1423 #endif
1424
1425 dst += UNROLL;
1426 // we don't need to increment src as the asm above has already done it
1427 count -= UNROLL;
1428 x += UNROLL; // probably superfluous
1429 }
1430 }
1431 #undef UNROLL
1432
1433 // residuals
1434 if (count > 0) {
1435 DITHER_565_SCAN(y);
1436 do {
1437 SkPMColor c = *src++;
1438 SkPMColorAssert(c);
1439 SkASSERT(SkGetPackedA32(c) == 255);
1440
1441 unsigned dither = DITHER_VALUE(x);
1442 *dst++ = SkDitherRGB32To565(c, dither);
1443 DITHER_INC_X(x);
1444 } while (--count != 0);
1445 }
1446 }
1447
1448 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1449 SkPMColor color) {
1450 if (count <= 0) {
1451 return;
1452 }
1453
1454 if (0 == color) {
1455 if (src != dst) {
1456 memcpy(dst, src, count * sizeof(SkPMColor));
1457 }
1458 return;
1459 }
1460
1461 unsigned colorA = SkGetPackedA32(color);
1462 if (255 == colorA) {
1463 sk_memset32(dst, color, count);
1464 return;
1465 }
1466
1467 unsigned scale = 256 - SkAlpha255To256(colorA);
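// Every output pixel is color + SkAlphaMulQ(src, scale); the NEON block
// below applies that per-channel multiply-and-shift to 8 pixels at a time.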
1468
1469 if (count >= 8) {
1470 uint32x4_t vcolor;
1471 uint8x8_t vscale;
1472
1473 vcolor = vdupq_n_u32(color);
1474
1475 // scale is within [0, 255], so it can be loaded into 8-bit lanes
1476 vscale = vdup_n_u8(scale);
1477
1478 do {
1479 // load src color, 8 pixels, 4 64 bit registers
1480 // (and increment src).
1481 uint32x2x4_t vsrc;
1482 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1483 asm (
1484 "vld1.32 %h[vsrc], [%[src]]!"
1485 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1486 : :
1487 );
1488 #else // 64bit targets and Clang
1489 vsrc.val[0] = vld1_u32(src);
1490 vsrc.val[1] = vld1_u32(src+2);
1491 vsrc.val[2] = vld1_u32(src+4);
1492 vsrc.val[3] = vld1_u32(src+6);
1493 src += 8;
1494 #endif
1495
1496 // multiply long by scale, 64 bits at a time,
1497 // destination into a 128 bit register.
1498 uint16x8x4_t vtmp;
1499 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1500 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1501 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1502 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1503
1504 // shift the 128 bit registers, containing the 16
1505 // bit scaled values back to 8 bits, narrowing the
1506 // results to 64 bit registers.
1507 uint8x16x2_t vres;
1508 vres.val[0] = vcombine_u8(
1509 vshrn_n_u16(vtmp.val[0], 8),
1510 vshrn_n_u16(vtmp.val[1], 8));
1511 vres.val[1] = vcombine_u8(
1512 vshrn_n_u16(vtmp.val[2], 8),
1513 vshrn_n_u16(vtmp.val[3], 8));
1514
1515 // adding back the color, using 128 bit registers.
1516 uint32x4x2_t vdst;
1517 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1518 vreinterpretq_u8_u32(vcolor));
1519 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1520 vreinterpretq_u8_u32(vcolor));
1521
1522 // store back the 8 calculated pixels (2 128 bit
1523 // registers), and increment dst.
1524 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1525 asm (
1526 "vst1.32 %h[vdst], [%[dst]]!"
1527 : [dst] "+r" (dst)
1528 : [vdst] "w" (vdst)
1529 : "memory"
1530 );
1531 #else // 64bit targets and Clang
1532 vst1q_u32(dst, vdst.val[0]);
1533 vst1q_u32(dst+4, vdst.val[1]);
1534 dst += 8;
1535 #endif
1536 count -= 8;
1537
1538 } while (count >= 8);
1539 }
1540
1541 while (count > 0) {
1542 *dst = color + SkAlphaMulQ(*src, scale);
1543 src += 1;
1544 dst += 1;
1545 count--;
1546 }
1547 }
1548
1549 ///////////////////////////////////////////////////////////////////////////////
1550
1551 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1552 // no dither
1553 S32_D565_Opaque_neon,
1554 S32_D565_Blend_neon,
1555 #ifdef SK_CPU_ARM32
1556 S32A_D565_Opaque_neon,
1557 #else
1558 NULL,
1559 #endif
1560 S32A_D565_Blend_neon,
1561
1562 // dither
1563 S32_D565_Opaque_Dither_neon,
1564 S32_D565_Blend_Dither_neon,
1565 S32A_D565_Opaque_Dither_neon,
1566 NULL, // S32A_D565_Blend_Dither
1567 };
1568
1569 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1570 NULL, // S32_Opaque,
1571 S32_Blend_BlitRow32_neon, // S32_Blend,
1572 /*
1573 * We have two choices for S32A_Opaque procs. One reads the src alpha
1574 * value and attempts to optimize accordingly. The optimization is
1575 * sensitive to the source content and is not a win in all cases. For
1576 * example, if there are a lot of transitions between the alpha states,
1577 * the performance will almost certainly be worse. However, for many
1578 * common cases the performance is equivalent or better than the standard
1579 * case where we do not inspect the src alpha.
1580 */
1581 #if SK_A32_SHIFT == 24
1582 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1583 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1584 #else
1585 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1586 #endif
1587 #ifdef SK_CPU_ARM32
1588 S32A_Blend_BlitRow32_neon // S32A_Blend
1589 #else
1590 NULL
1591 #endif
1592 };
1593