/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}
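
/* Note: the inline asm in the helpers above and below exists to get a
 * structured load with a post-incremented source pointer. A plain-intrinsics
 * sketch of the same load (not part of the original source) would be:
 *
 *     uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src); // deinterleave 8 pixels
 *     src += 8;                                        // advance 32 bytes
 *
 * presumably the asm keeps the pointer update fused into the ld4 instruction
 * rather than leaving it to the compiler.
 */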

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}
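
/* For reference, SkPixel32ToPixel16_neon8 above vectorizes the usual
 * 8888 -> 565 packing; a scalar sketch (to565 is a hypothetical helper,
 * not part of Skia) is:
 *
 *     static inline uint16_t to565(U8CPU r, U8CPU g, U8CPU b) {
 *         return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
 *     }
 */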

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
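
/* A scalar model of the vector loop above (this is what SkAlphaBlend
 * computes, with scale = SkAlpha255To256(alpha)):
 *
 *     result_c = dst_c + (((src_c - dst_c) * scale) >> 8)
 *
 * The subtraction can wrap in the unsigned 16-bit lanes, but only the low
 * 5 or 6 bits of each channel survive the final vsli packing, and those
 * bits are unaffected by the wrap-around offset.
 */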

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
#endif

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
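
/* Scalar equivalent of SkDiv255Round_neon8 (a sketch): for prod up to
 * 255*255, this computes the correctly rounded prod / 255 without leaving
 * 16 bits:
 *
 *     static inline unsigned div255_round(unsigned prod) {
 *         prod += 128;
 *         return (prod + (prod >> 8)) >> 8;
 *     }
 */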

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
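
/* Example of how the matrix is indexed (this is exactly what the kernels
 * below do): for a scanline starting at device coordinates (x, y),
 *
 *     const uint8_t* dstart = &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
 *     uint8x8_t vdither = vld1_u8(dstart);
 *
 * Each 12-byte row repeats its 4-entry pattern three times, so any of the
 * four (x & 3) starting offsets still yields 8 valid dither values.
 */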

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into green/blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
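
/* Per-pixel model of the dithered blend above (a sketch that follows the
 * scalar leftover loop): each source channel is first dithered down to 565
 * precision, e.g. for red
 *
 *     r565 = (sr + d - (sr >> 5)) >> 3;   // SkDITHER_R32To565(sr, d)
 *
 * and the 565 channels are then interpolated toward dst with
 * SkAlphaBlend(r565, dst_r, scale), just as in S32_D565_Blend_neon.
 */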

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count > 0) {
        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define    UNROLL    4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if    UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef    UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
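
/* Scalar model of one lane of the unrolled loop above (a sketch): for each
 * byte channel,
 *
 *     dst = src + ((dst * (256 - src_alpha)) >> 8);
 *
 * which matches SkPMSrcOver() under SkAlpha255To256() semantics, since
 * (255 - alpha) + 1 == 256 - alpha.
 */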

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16(dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16(dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while (src < src_end);

    if (src >= src_end)
        goto TAIL;

    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:
    /* In this state, we know the current alpha is 0 and
       we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
    } while (src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if (src >= src_end)
        goto TAIL;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
        src += UNROLL;
        dst += UNROLL;
        if (src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while (src != src_end) {
        if (*src != 0) {
            if (*src >= ALPHA_OPAQ) {
                *dst = *src;
            } else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}
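
/* The routine above is a small state machine: ALPHA_0 skips over runs of
 * fully transparent pixels, ALPHA_255 copies runs of fully opaque pixels
 * verbatim, and ALPHA_1_TO_254 falls back to the full NEON blend for
 * genuinely translucent spans. The ALPHA_TRANS/ALPHA_OPAQ comparisons
 * assume alpha lives in the top byte of each SkPMColor, i.e.
 * SK_A32_SHIFT == 24 (see the proc table at the bottom of this file).
 */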

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
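
/* Scalar model of the 2-pixel loop above (a sketch): with
 * src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale,
 * every byte channel computes
 *
 *     result = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8);
 */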

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf, "%8s:", str);
    for (i = 0; i < len; i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}

static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf, "%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for (i = 0; i < len; i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for (i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
                vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);    // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow. d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr - (sr>>5)) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d, 1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb, 2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg, 3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr, 2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad = 0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad = 1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i]) ? "BAD" : "got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format  (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i = 0; i < UNROLL; i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;        // probably superfluous
        }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
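
/* The dither arithmetic used by both opaque-dither kernels above rests on
 * the same identity (a sketch for a 5-bit channel):
 *
 *     r565 = (r + d - (r >> 5)) >> 3;
 *
 * r - (r >> 5) compresses [0, 255] to [0, 248], so adding a dither value
 * d in [0, 7] cannot overflow a byte, and the top five bits of the sum are
 * the dithered channel -- the "watching for 8-bit overflow" comment above
 * refers to exactly this.
 */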

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
        return;
    }

    unsigned scale = 256 - SkAlpha255To256(colorA);

    if (count >= 8) {
        uint32x4_t vcolor;
        uint8x8_t vscale;

        vcolor = vdupq_n_u32(color);

        // scale lies in the numerical interval [0-255], so it fits in 8 bits
        vscale = vdup_n_u8(scale);

        do {
            // load src color, 8 pixels, 4 64 bit registers
            // (and increment src).
            uint32x2x4_t vsrc;
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vld1.32    %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+r" (src)
                : :
            );
#else // 64bit targets and Clang
            vsrc.val[0] = vld1_u32(src);
            vsrc.val[1] = vld1_u32(src+2);
            vsrc.val[2] = vld1_u32(src+4);
            vsrc.val[3] = vld1_u32(src+6);
            src += 8;
#endif

            // multiply long by scale, 64 bits at a time,
            // destination into a 128 bit register.
            uint16x8x4_t vtmp;
            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

            // shift the 128 bit registers, containing the 16
            // bit scaled values, back to 8 bits, narrowing the
            // results to 64 bit registers.
            uint8x16x2_t vres;
            vres.val[0] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[0], 8),
                            vshrn_n_u16(vtmp.val[1], 8));
            vres.val[1] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[2], 8),
                            vshrn_n_u16(vtmp.val[3], 8));

            // add back the color, using 128 bit registers.
            uint32x4x2_t vdst;
            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
                                               vreinterpretq_u8_u32(vcolor));
            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
                                               vreinterpretq_u8_u32(vcolor));

            // store back the 8 calculated pixels (2 128 bit
            // registers), and increment dst.
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vst1.32    %h[vdst], [%[dst]]!"
                : [dst] "+r" (dst)
                : [vdst] "w" (vdst)
                : "memory"
            );
#else // 64bit targets and Clang
            vst1q_u32(dst, vdst.val[0]);
            vst1q_u32(dst+4, vdst.val[1]);
            dst += 8;
#endif
            count -= 8;

        } while (count >= 8);
    }

    while (count > 0) {
        *dst = color + SkAlphaMulQ(*src, scale);
        src += 1;
        dst += 1;
        count--;
    }
}
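
/* Scalar model of Color32_arm_neon's main loop, matching the leftover loop
 * above: with scale = 256 - SkAlpha255To256(SkGetPackedA32(color)), each
 * pixel computes
 *
 *     dst = color + SkAlphaMulQ(src, scale);  // per byte: color + ((src * scale) >> 8)
 */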

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
#ifdef SK_CPU_ARM32
    S32A_D565_Opaque_neon,
#else
    NULL,
#endif
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    NULL
#endif
};