• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitRow_opts_arm_neon.h"
9 
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16 
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19 
20 #ifdef SK_CPU_ARM64
// Load 32 bytes (8 SkPMColor pixels) from 'src' with a 4-way deinterleaving
// load (ld4) and advance 'src' by 8 pixels (the pointer is taken by
// reference so the caller sees the increment). Only the first three channel
// lanes (val[0..2]) of the result are filled; val[3] is left uninitialized,
// so callers must not read it. Equivalent to vld4_u8 plus src += 8, done in
// asm — presumably for better code generation than the intrinsic; verify
// before replacing with vld4_u8.
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        // ld4 deinterleaves 8 pixels into v0..v3 (one channel per register)
        // and post-increments the source pointer by 32 bytes.
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}
41 
// Load 32 bytes (8 SkPMColor pixels) from 'src' with a 4-way deinterleaving
// load (ld4), filling all four channel lanes of the result, and advance
// 'src' by 8 pixels (pointer taken by reference so the caller sees the
// increment). Equivalent to vld4_u8 plus src += 8, done in asm — presumably
// for better code generation than the intrinsic; verify before replacing.
static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        // ld4 deinterleaves 8 pixels into v0..v3 (one channel per register)
        // and post-increments the source pointer by 32 bytes.
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
65 #endif
66 
// Opaque S32 -> D565 blit: 'alpha' must be 255, so every source pixel
// simply replaces the destination pixel after repacking 8888 -> 565.
// x and y are unused. Processes 8 pixels per NEON iteration, then
// finishes the tail with scalar code.
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load 8 deinterleaved source pixels; the arm64 helper also
        // advances src, the intrinsic path advances it explicitly.
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    };

    // Leftovers: scalar conversion for the final 0-7 pixels.
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}
104 
// Blend a row of 32-bit premultiplied pixels onto a 565 destination with a
// constant coverage 'alpha' (must be < 255). Source per-pixel alpha is not
// consulted on this path: each channel is interpolated between src and dst
// by SkAlpha255To256(alpha). x and y are unused.
void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    // Main loop: 8 pixels per iteration.
    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        // Deinterleaving load of 8 pixels into d0-d3 with post-increment of
        // src. Only the three color channels are copied out; vsrc.val[3]
        // (the alpha lane) is left unset and is not used below.
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst (vector operators; the subtraction may wrap, which
        // is cancelled when scaled and added back to dst below)
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    // Scalar tail for the final 0-7 pixels.
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
189 
190 #ifdef SK_CPU_ARM32
// ARM32 path: blit a row of 32-bit premultiplied pixels with per-pixel
// alpha over a 565 destination ('alpha' parameter must be 255; x and y are
// unused). Hand-scheduled NEON asm:
//  - count >= 8: a software-pipelined loop over groups of 8 pixels; the
//    ragged head (count % 8) is handled by the first partially-overlapping
//    iteration, with keep_dst delaying each store by one iteration.
//  - count < 8: one composite of up to 7 pixels assembled from partial
//    loads/stores selected by the low bits of count.
// NOTE(review): dst, src and keep_dst are listed as asm *inputs* yet the
// asm advances them; this relies on the "memory" clobber and on the C
// variables not being read after the asm — confirm before changing the
// constraints or surrounding code.
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
380 
381 #else // #ifdef SK_CPU_ARM32
382 
// ARM64 path: blit a row of 32-bit premultiplied pixels with per-pixel
// alpha over a 565 destination ('alpha' parameter must be 255; x and y are
// unused). The asm loop consumes 16 pixels per iteration (two 8-lane
// halves, interleaved for scheduling); rows shorter than 16 pixels and any
// remainder fall through to the scalar SkSrcOver32To16 loop below.
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %w[count], %w[count], #16      \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %w[count], #16                 \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
        // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            // Fully-transparent source pixels leave dst untouched.
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
478 #endif // #ifdef SK_CPU_ARM32
479 
pmcolor_to_expand16(SkPMColor c)480 static uint32_t pmcolor_to_expand16(SkPMColor c) {
481     unsigned r = SkGetPackedR32(c);
482     unsigned g = SkGetPackedG32(c);
483     unsigned b = SkGetPackedB32(c);
484     return (g << 24) | (r << 13) | (b << 2);
485 }
486 
// Blend one constant (possibly translucent) premultiplied color 'src' over
// a row of 565 pixels:
//   dst_rgb = src_rgb + dst_rgb * ((256 - src_a) >> 3), in expanded form.
// A scalar preamble brings dst up to 8-byte alignment, the NEON body then
// handles 16 pixels per iteration, and a scalar tail finishes the rest.
// x and y are unused.
void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble code is in order to make dst aligned to 8 bytes
     * in the next mutiple bytes read & write access.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    // Scalar blend for up to 3 leading pixels; stops early if the row
    // runs out before alignment is reached.
    for (int i = 0; i < preamble_size; i+=2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = 0;
    count16 = count >> 4;    // number of full 16-pixel NEON iterations
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
        //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
        //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
        //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    //to store dst again.

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    // Scalar tail: blend the remaining (count % 16) pixels.
    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}
603 
// Divide each 16-bit lane by 255 with rounding, via the classic
// ((x + 128) + ((x + 128) >> 8)) >> 8 identity — exact for the products of
// two 8-bit values that this file feeds it.
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    const uint16x8_t biased   = vaddq_u16(prod, vdupq_n_u16(128));
    const uint16x8_t adjusted = vaddq_u16(biased, vshrq_n_u16(biased, 8));
    return vshrq_n_u16(adjusted, 8);
}
609 
// Blend a row of 32-bit premultiplied pixels (with per-pixel alpha) over a
// 565 destination, additionally scaled by a constant 'alpha' (must be
// < 255). Per channel: dst = src*alpha/255 + dst*(255 - sa*alpha/255)/255,
// in 565 precision. x and y are unused.
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
   SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
            // Three equivalent ways to do a deinterleaving 8-pixel load with
            // post-increment of src, chosen by target/compiler version.
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            // Rounding shift approximates /255 by /256; source of the
            // off-by-one mismatches mentioned above.
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers: exact scalar blend for the final 0-7 pixels.
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}
725 
/* Ordered-dither matrix for NEON, derived from gDitherMatrix_3Bit_16.
 * Each 4-entry dither row is replicated three times so that an 8-byte
 * vector load starting at offset 0, 1, 2 or 3 from the row start always
 * reads valid dither values (supports the (x & 3) addressing below
 * without any wrap-around logic).
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5,  0, 4, 1, 5,  0, 4, 1, 5,
    6, 2, 7, 3,  6, 2, 7, 3,  6, 2, 7, 3,
    1, 5, 0, 4,  1, 5, 0, 4,  1, 5, 0, 4,
    7, 3, 6, 2,  7, 3, 6, 2,  7, 3, 6, 2,
};
738 
/* NEON S32_D565_Blend_Dither: blend 32-bit source pixels into an RGB565
 * destination with a constant global alpha and ordered dithering.
 *
 * dst   : RGB565 destination row
 * src   : SkPMColor source row
 * count : number of pixels
 * alpha : global blend factor; must be < 255 (255 goes to the Opaque proc)
 * x, y  : device coordinates, used only to select dither values
 *
 * NOTE(review): only the R/G/B source lanes are read -- per-pixel source
 * alpha is ignored, so this presumably expects opaque sources; confirm
 * against the dispatch table contract.
 */
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            // Pin the vld4.8 destination to d0-d3; the load deinterleaves
            // 8 pixels into per-channel vectors and post-increments src.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            // val[3] (the alpha lane) is intentionally left unset: it is
            // never read below.
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            // reduce the dithered 8-bit channels to 565 widths (5/6/5 bits)
            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            // (blend as dst + ((src - dst) * scale) >> 8; diffs are signed)
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        // shadows the outer 'scale' with the same value; kept as-is
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
873 
/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 *
 * Blends src over dst with a constant global alpha:
 *   res = (src * src_scale + dst * dst_scale) >> 8
 * where src_scale = alpha+1 (1..256) and dst_scale = 256 - src_scale.
 * Two pixels per iteration, single-pixel tail handled with lane loads.
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        // src_scale may be 256, which does not fit a u8, so widen first
        // and use a full 16-bit multiply.
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        // dst_scale <= 255 here, so a widening u8 multiply suffices.
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load a single pixel into lane 0 of each vector
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process (same math as the 2-pixel loop above)
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
942 
943 #ifdef SK_CPU_ARM32
/* NEON S32A_Blend_BlitRow32: per-pixel source alpha combined with a
 * constant global alpha. Per pixel:
 *   dst = (src * alpha256 + dst * SkAlphaMulInv256(sa, alpha256)) >> 8
 * Odd pixel handled first so the main loop runs two pixels at a time.
 */
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load one pixel into lane 0
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        // byte 3 is the source alpha -- assumes little-endian PMColor
        // layout (matches the {3,3,3,3,...} alpha_mask table below)
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        // vtbl index table: broadcast each pixel's alpha byte (offset 3
        // within the pixel) across that pixel's four lanes
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load two pixels
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}
1040 
1041 ///////////////////////////////////////////////////////////////////////////////
1042 
1043 #endif // #ifdef SK_CPU_ARM32
1044 
/* NEON S32A_D565_Opaque_Dither: blit 32-bit pixels with per-pixel alpha
 * onto an RGB565 destination with ordered dithering. 'alpha' (the global
 * factor) must be 255; only per-pixel source alpha participates.
 */
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        // Pin the vld4.8 outputs to d0-d3; the load deinterleaves 8
        // pixels into channel vectors and post-increments src.
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe  as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1173 
1174 ///////////////////////////////////////////////////////////////////////////////
1175 
/* NEON S32_D565_Opaque_Dither: convert fully-opaque 32-bit source pixels
 * to RGB565 with ordered dithering (no blending with dst).
 * 'alpha' must be 255; x/y select the dither row/offset.
 */
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        // Pin the vld4.8 outputs to d0-d3; post-increments src by 8 pixels.
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        // val[3] (alpha) intentionally unset: the source is opaque and
        // the lane is never read.
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1264 
1265 ///////////////////////////////////////////////////////////////////////////////
1266 
/* Proc16 dispatch table for 565 destinations; indexed by the SkBlitRow
 * flag combination (blend, then dither). nullptr entries fall back to
 * the portable implementation.
 * NOTE(review): entry order must mirror the portable Proc16 table --
 * verify against SkBlitRow.h before reordering.
 */
const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    nullptr,   // S32A_D565_Blend_Dither
};
1284 
/* ColorProc16 dispatch table: a single proc, Color32A_D565_neon, serves
 * all four variants (opaque/alpha x dither/no-dither). */
const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};
1291 
/* Proc32 dispatch table for 32-bit destinations. nullptr entries either
 * fall back to portable code or (for S32A_Opaque) were ported to SkOpts.
 * The S32A_Blend entry exists only on 32-bit ARM (see #ifdef above its
 * definition). */
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    nullptr,  // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};
1302