1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitRow_opts_arm_neon.h"
9 
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16 
17 #include "SkCachePreload_arm.h"
18 #include "SkColor_opts_neon.h"
19 #include <arm_neon.h>
20 
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22                            const SkPMColor* SK_RESTRICT src, int count,
23                            U8CPU alpha, int /*x*/, int /*y*/) {
24     SkASSERT(255 == alpha);
25 
26     while (count >= 8) {
27         uint8x8x4_t vsrc;
28         uint16x8_t vdst;
29 
30         // Load
31         vsrc = vld4_u8((uint8_t*)src);
32 
33         // Convert src to 565
34         vdst = SkPixel32ToPixel16_neon8(vsrc);
35 
36         // Store
37         vst1q_u16(dst, vdst);
38 
39         // Prepare next iteration
40         dst += 8;
41         src += 8;
42         count -= 8;
43     }
44 
45     // Leftovers
46     while (count > 0) {
47         SkPMColor c = *src++;
48         SkPMColorAssert(c);
49         *dst = SkPixel32ToPixel16_ToU16(c);
50         dst++;
51         count--;
52     }
53 }
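/* For reference, a minimal per-pixel sketch of the 8888 -> 565 packing that
 * both the vectorized loop and SkPixel32ToPixel16_ToU16() above perform.
 * The helper name is illustrative only; it is not part of Skia.
 */
#if 0
static inline uint16_t pack_8888_to_565(SkPMColor c) {
    unsigned r = SkGetPackedR32(c) >> 3;    // keep the top 5 bits of red
    unsigned g = SkGetPackedG32(c) >> 2;    // keep the top 6 bits of green
    unsigned b = SkGetPackedB32(c) >> 3;    // keep the top 5 bits of blue
    return SkPackRGB16(r, g, b);            // pack as r:5 g:6 b:5
}
#endif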
54 
55 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
56                            const SkPMColor* SK_RESTRICT src, int count,
57                            U8CPU alpha, int /*x*/, int /*y*/) {
58     SkASSERT(255 == alpha);
59 
60     if (count >= 8) {
61         uint16_t* SK_RESTRICT keep_dst = 0;
62 
63         asm volatile (
64                       "ands       ip, %[count], #7            \n\t"
65                       "vmov.u8    d31, #1<<7                  \n\t"
66                       "vld1.16    {q12}, [%[dst]]             \n\t"
67                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
68                       // Thumb does not support the standard ARM conditional
69                       // instructions but instead requires the 'it' instruction
70                       // to signal conditional execution
71                       "it eq                                  \n\t"
72                       "moveq      ip, #8                      \n\t"
73                       "mov        %[keep_dst], %[dst]         \n\t"
74 
75                       "add        %[src], %[src], ip, LSL#2   \n\t"
76                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
77                       "subs       %[count], %[count], ip      \n\t"
78                       "b          9f                          \n\t"
79                       // LOOP
80                       "2:                                         \n\t"
81 
82                       "vld1.16    {q12}, [%[dst]]!            \n\t"
83                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
84                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
85                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
86                       "subs       %[count], %[count], #8      \n\t"
87                       "9:                                         \n\t"
88                       "pld        [%[dst],#32]                \n\t"
89                       // expand 0565 q12 to 8888 {d4-d7}
90                       "vmovn.u16  d4, q12                     \n\t"
91                       "vshr.u16   q11, q12, #5                \n\t"
92                       "vshr.u16   q10, q12, #6+5              \n\t"
93                       "vmovn.u16  d5, q11                     \n\t"
94                       "vmovn.u16  d6, q10                     \n\t"
95                       "vshl.u8    d4, d4, #3                  \n\t"
96                       "vshl.u8    d5, d5, #2                  \n\t"
97                       "vshl.u8    d6, d6, #3                  \n\t"
98 
99                       "vmovl.u8   q14, d31                    \n\t"
100                       "vmovl.u8   q13, d31                    \n\t"
101                       "vmovl.u8   q12, d31                    \n\t"
102 
103                       // duplicate in 4/2/1 & 8pix vsns
104                       "vmvn.8     d30, d3                     \n\t"
105                       "vmlal.u8   q14, d30, d6                \n\t"
106                       "vmlal.u8   q13, d30, d5                \n\t"
107                       "vmlal.u8   q12, d30, d4                \n\t"
108                       "vshr.u16   q8, q14, #5                 \n\t"
109                       "vshr.u16   q9, q13, #6                 \n\t"
110                       "vaddhn.u16 d6, q14, q8                 \n\t"
111                       "vshr.u16   q8, q12, #5                 \n\t"
112                       "vaddhn.u16 d5, q13, q9                 \n\t"
113                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
114                       "vaddhn.u16 d4, q12, q8                 \n\t"
115                       // intentionally don't calculate alpha
116                       // result in d4-d6
117 
118                       "vqadd.u8   d5, d5, d1                  \n\t"
119                       "vqadd.u8   d4, d4, d2                  \n\t"
120 
121                       // pack 8888 {d4-d6} to 0565 q10
122                       "vshll.u8   q10, d6, #8                 \n\t"
123                       "vshll.u8   q3, d5, #8                  \n\t"
124                       "vshll.u8   q2, d4, #8                  \n\t"
125                       "vsri.u16   q10, q3, #5                 \n\t"
126                       "vsri.u16   q10, q2, #11                \n\t"
127 
128                       "bne        2b                          \n\t"
129 
130                       "1:                                         \n\t"
131                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
132                       : [count] "+r" (count)
133                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
134                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
135                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
136                       "d30","d31"
137                       );
138     }
139     else
140     {   // handle count < 8
141         uint16_t* SK_RESTRICT keep_dst = 0;
142 
143         asm volatile (
144                       "vmov.u8    d31, #1<<7                  \n\t"
145                       "mov        %[keep_dst], %[dst]         \n\t"
146 
147                       "tst        %[count], #4                \n\t"
148                       "beq        14f                         \n\t"
149                       "vld1.16    {d25}, [%[dst]]!            \n\t"
150                       "vld1.32    {q1}, [%[src]]!             \n\t"
151 
152                       "14:                                        \n\t"
153                       "tst        %[count], #2                \n\t"
154                       "beq        12f                         \n\t"
155                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
156                       "vld1.32    {d1}, [%[src]]!             \n\t"
157 
158                       "12:                                        \n\t"
159                       "tst        %[count], #1                \n\t"
160                       "beq        11f                         \n\t"
161                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
162                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
163 
164                       "11:                                        \n\t"
165                       // unzips achieve the same as a vld4 operation
166                       "vuzpq.u16  q0, q1                      \n\t"
167                       "vuzp.u8    d0, d1                      \n\t"
168                       "vuzp.u8    d2, d3                      \n\t"
169                       // expand 0565 q12 to 8888 {d4-d7}
170                       "vmovn.u16  d4, q12                     \n\t"
171                       "vshr.u16   q11, q12, #5                \n\t"
172                       "vshr.u16   q10, q12, #6+5              \n\t"
173                       "vmovn.u16  d5, q11                     \n\t"
174                       "vmovn.u16  d6, q10                     \n\t"
175                       "vshl.u8    d4, d4, #3                  \n\t"
176                       "vshl.u8    d5, d5, #2                  \n\t"
177                       "vshl.u8    d6, d6, #3                  \n\t"
178 
179                       "vmovl.u8   q14, d31                    \n\t"
180                       "vmovl.u8   q13, d31                    \n\t"
181                       "vmovl.u8   q12, d31                    \n\t"
182 
183                       // duplicate in 4/2/1 & 8pix vsns
184                       "vmvn.8     d30, d3                     \n\t"
185                       "vmlal.u8   q14, d30, d6                \n\t"
186                       "vmlal.u8   q13, d30, d5                \n\t"
187                       "vmlal.u8   q12, d30, d4                \n\t"
188                       "vshr.u16   q8, q14, #5                 \n\t"
189                       "vshr.u16   q9, q13, #6                 \n\t"
190                       "vaddhn.u16 d6, q14, q8                 \n\t"
191                       "vshr.u16   q8, q12, #5                 \n\t"
192                       "vaddhn.u16 d5, q13, q9                 \n\t"
193                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
194                       "vaddhn.u16 d4, q12, q8                 \n\t"
195                       // intentionally don't calculate alpha
196                       // result in d4-d6
197 
198                       "vqadd.u8   d5, d5, d1                  \n\t"
199                       "vqadd.u8   d4, d4, d2                  \n\t"
200 
201                       // pack 8888 {d4-d6} to 0565 q10
202                       "vshll.u8   q10, d6, #8                 \n\t"
203                       "vshll.u8   q3, d5, #8                  \n\t"
204                       "vshll.u8   q2, d4, #8                  \n\t"
205                       "vsri.u16   q10, q3, #5                 \n\t"
206                       "vsri.u16   q10, q2, #11                \n\t"
207 
208                       // store
209                       "tst        %[count], #4                \n\t"
210                       "beq        24f                         \n\t"
211                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
212 
213                       "24:                                        \n\t"
214                       "tst        %[count], #2                \n\t"
215                       "beq        22f                         \n\t"
216                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
217 
218                       "22:                                        \n\t"
219                       "tst        %[count], #1                \n\t"
220                       "beq        21f                         \n\t"
221                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
222 
223                       "21:                                        \n\t"
224                       : [count] "+r" (count)
225                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
226                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
227                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
228                       "d30","d31"
229                       );
230     }
231 }
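/* Per-pixel sketch of what the asm above computes: expand the 565 dst to
 * 8888 (the vshl/vmovn block), scale it by the inverse src alpha, add the
 * premultiplied src channels with saturation (vqadd), and repack. The helper
 * name and the use of SkMulDiv255Round() for the /255 step are illustrative;
 * the asm uses a cheaper fixed-point approximation of that divide.
 */
#if 0
static inline uint16_t srcover_32_to_565(SkPMColor src, uint16_t dst) {
    unsigned isa = 255 - SkGetPackedA32(src);   // inverse src alpha
    unsigned dr = SkGetPackedR16(dst) << 3;     // 5 -> 8 bits
    unsigned dg = SkGetPackedG16(dst) << 2;     // 6 -> 8 bits
    unsigned db = SkGetPackedB16(dst) << 3;     // 5 -> 8 bits
    unsigned r = SkGetPackedR32(src) + SkMulDiv255Round(dr, isa);
    unsigned g = SkGetPackedG32(src) + SkMulDiv255Round(dg, isa);
    unsigned b = SkGetPackedB32(src) + SkMulDiv255Round(db, isa);
    r = SkMin32(r, 255); g = SkMin32(g, 255); b = SkMin32(b, 255);
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif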
232 
233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
234                           const SkPMColor* SK_RESTRICT src, int count,
235                           U8CPU alpha, int /*x*/, int /*y*/) {
236 
237     U8CPU alpha_for_asm = alpha;
238 
239     asm volatile (
240     /* This code implements a Neon version of S32A_D565_Blend. The output differs from
241      * the original in two respects:
242      *  1. The results have a few mismatches compared to the original code. These mismatches
243      *     never exceed 1. It's possible to improve accuracy vs. a floating point
244      *     implementation by introducing rounding right shifts (vrshr) for the final stage.
245      *     Rounding is not present in the code below, because although results would be closer
246      *     to a floating point implementation, the number of mismatches compared to the
247      *     original code would be far greater.
248      *  2. On certain inputs, the original code can overflow, causing colour channels to
249      *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
250      *     to affect another.
251      */
252 
253 #if 1
254         /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
255                   "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
256 #else
257                   "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
258 #endif
259                   "vmov.u16   q3, #255                        \n\t"   // set up constant
260                   "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
261                   "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
262                   "beq        2f                              \n\t"   // if count8 == 0, exit
263                   "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
264 
265                   "1:                                             \n\t"
266                   "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
267                   "subs       r4, r4, #1                      \n\t"   // decrement loop counter
268                   "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
269                   //  and deinterleave
270 
271                   "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
272                   "vand       q10, q0, q15                    \n\t"   // extract blue
273                   "vshr.u16   q8, q0, #11                     \n\t"   // extract red
274                   "vshr.u16   q9, q9, #10                     \n\t"   // extract green
275                   // dstrgb = {q8, q9, q10}
276 
277                   "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
278                   "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
279                   "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
280 
281                   "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
282                   "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
283                   "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
284                   "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
285                   // srcrgba = {q11, q12, q13, q14}
286 
287                   "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
288                   "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
289                   "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
290                   "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
291 
292                   "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
293                   "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
294                   // dst_scale = q2
295 
296                   "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
297                   "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
298                   "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
299 
300 #if 1
301     // trying for a better match with SkDiv255Round(a)
302     // C alg is:  a+=128; (a+a>>8)>>8
303     // we'll use just a rounding shift [q2 is available for scratch]
304                   "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
305                   "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
306                   "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
307 #else
308     // arm's original "truncating divide by 256"
309                   "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
310                   "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
311                   "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
312 #endif
313 
314                   "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
315                   "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
316                   "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
317 
318                   "bne        1b                              \n\t"   // if counter != 0, loop
319                   "2:                                             \n\t"   // exit
320 
321                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
322                   :
323                   : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
324                   );
325 
326     count &= 7;
327     if (count > 0) {
328         do {
329             SkPMColor sc = *src++;
330             if (sc) {
331                 uint16_t dc = *dst;
332                 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
333                 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
334                 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
335                 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
336                 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
337             }
338             dst += 1;
339         } while (--count != 0);
340     }
341 }
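/* A minimal sketch of the two "divide by roughly 255" options discussed in
 * the asm above (helper names are illustrative): SkDiv255Round() follows the
 * first form; the vrshr.u16 #8 instructions implement the second.
 */
#if 0
static inline unsigned div255_round(unsigned x) {      // x += 128; (x + (x >> 8)) >> 8
    x += 128;
    return (x + (x >> 8)) >> 8;
}
static inline unsigned rounding_shift_8(unsigned x) {  // what vrshr.u16 #8 computes
    return (x + 128) >> 8;
}
#endif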
342 
343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
344  * each dither value is spaced out into byte lanes, and repeated
345  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
346  * start of each row.
347  */
348 static const uint8_t gDitherMatrix_Neon[48] = {
349     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
350     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
351     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
352     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
353 
354 };
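/* Worked example of the indexing used below: for a span starting at
 * (x, y) = (2, 1), dstart = &gDitherMatrix_Neon[(1 & 3) * 12 + (2 & 3)]
 * points two bytes into the second row, so the 8-byte load picks up
 * {7, 3, 6, 2, 7, 3, 6, 2} -- the dither row for y % 4 == 1, rotated to
 * start at column x % 4 == 2.
 */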
355 
356 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
357                                 int count, U8CPU alpha, int x, int y)
358 {
359 
360     SkASSERT(255 > alpha);
361 
362     // rescale alpha to range 1 - 256
363     int scale = SkAlpha255To256(alpha);
364 
365     if (count >= 8) {
366         /* select row and offset for dither array */
367         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
368 
369         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
370         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
371 
372         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
373         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
374 
375         do {
376 
377             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
378             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
379             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
380             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
381             uint16x8_t vdst;
382             uint16x8_t vdst_r, vdst_g, vdst_b;
383             int16x8_t vres_r, vres_g, vres_b;
384             int8x8_t vres8_r, vres8_g, vres8_b;
385 
386             // Load source and add dither
387             {
388             register uint8x8_t d0 asm("d0");
389             register uint8x8_t d1 asm("d1");
390             register uint8x8_t d2 asm("d2");
391             register uint8x8_t d3 asm("d3");
392 
393             asm (
394                 "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
395                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
396                 :
397             );
398             vsrc_g = d1;
399 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
400             vsrc_r = d2; vsrc_b = d0;
401 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
402             vsrc_r = d0; vsrc_b = d2;
403 #endif
404             }
405 
406             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
407             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
408             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
409 
410             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
411             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
412             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
413 
414             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
415             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
416             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
417 
418             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
419             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
420             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
421 
422             // Load dst and unpack
423             vdst = vld1q_u16(dst);
424             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
425             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
426             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
427 
428             // subtract dst from src and widen
429             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
430             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
431             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
432 
433             // multiply diffs by scale and shift
434             vres_r = vmulq_s16(vres_r, vscale);
435             vres_g = vmulq_s16(vres_g, vscale);
436             vres_b = vmulq_s16(vres_b, vscale);
437 
438             vres8_r = vshrn_n_s16(vres_r, 8);
439             vres8_g = vshrn_n_s16(vres_g, 8);
440             vres8_b = vshrn_n_s16(vres_b, 8);
441 
442             // add dst to result
443             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
444             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
445             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
446 
447             // put result into 565 format
448             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
449             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
450 
451             // Store result
452             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
453 
454             // Next iteration
455             dst += 8;
456             count -= 8;
457 
458         } while (count >= 8);
459     }
460 
461     // Leftovers
462     if (count > 0) {
463         int scale = SkAlpha255To256(alpha);
464         DITHER_565_SCAN(y);
465         do {
466             SkPMColor c = *src++;
467             SkPMColorAssert(c);
468 
469             int dither = DITHER_VALUE(x);
470             int sr = SkGetPackedR32(c);
471             int sg = SkGetPackedG32(c);
472             int sb = SkGetPackedB32(c);
473             sr = SkDITHER_R32To565(sr, dither);
474             sg = SkDITHER_G32To565(sg, dither);
475             sb = SkDITHER_B32To565(sb, dither);
476 
477             uint16_t d = *dst;
478             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
479                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
480                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
481             DITHER_INC_X(x);
482         } while (--count != 0);
483     }
484 }
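/* Per-channel sketch of the blend performed by the vector loop above; it is
 * essentially what SkAlphaBlend() does in the residual loop (helper name
 * illustrative): subtract, scale, shift, then add back the destination.
 */
#if 0
static inline int alpha_blend_channel(int src, int dst, int scale256) {
    return dst + (((src - dst) * scale256) >> 8);   // vsub/vmul/vshrn/vaddw
}
#endif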
485 
486 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
487                                 const SkPMColor* SK_RESTRICT src,
488                                 int count, U8CPU alpha) {
489 
490     SkASSERT(255 == alpha);
491     if (count > 0) {
492 
493 
494     uint8x8_t alpha_mask;
495 
496     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
497     alpha_mask = vld1_u8(alpha_mask_setup);
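    /* With this index vector, vtbl1_u8() below copies byte 3 (the alpha of
     * pixel 0, assuming the usual SK_A32_SHIFT == 24 little-endian layout)
     * into lanes 0-3 and byte 7 (the alpha of pixel 1) into lanes 4-7, so
     * every channel is later scaled by 256 minus its own pixel's alpha. */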
498 
499     /* do the NEON unrolled code */
500 #define    UNROLL    4
501     while (count >= UNROLL) {
502         uint8x8_t src_raw, dst_raw, dst_final;
503         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
504 
505         /* The two prefetches below may make the code slightly
506          * slower for small values of count but are worth having
507          * in the general case.
508          */
509         __builtin_prefetch(src+32);
510         __builtin_prefetch(dst+32);
511 
512         /* get the source */
513         src_raw = vreinterpret_u8_u32(vld1_u32(src));
514 #if    UNROLL > 2
515         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
516 #endif
517 
518         /* get and hold the dst too */
519         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
520 #if    UNROLL > 2
521         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
522 #endif
523 
524     /* 1st and 2nd bits of the unrolling */
525     {
526         uint8x8_t dst_cooked;
527         uint16x8_t dst_wide;
528         uint8x8_t alpha_narrow;
529         uint16x8_t alpha_wide;
530 
531         /* get the alphas spread out properly */
532         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
533         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
534 
535         /* spread the dest */
536         dst_wide = vmovl_u8(dst_raw);
537 
538         /* alpha mul the dest */
539         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
540         dst_cooked = vshrn_n_u16(dst_wide, 8);
541 
542         /* sum -- ignoring any byte lane overflows */
543         dst_final = vadd_u8(src_raw, dst_cooked);
544     }
545 
546 #if    UNROLL > 2
547     /* the 3rd and 4th bits of our unrolling */
548     {
549         uint8x8_t dst_cooked;
550         uint16x8_t dst_wide;
551         uint8x8_t alpha_narrow;
552         uint16x8_t alpha_wide;
553 
554         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
555         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
556 
557         /* spread the dest */
558         dst_wide = vmovl_u8(dst_raw_2);
559 
560         /* alpha mul the dest */
561         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
562         dst_cooked = vshrn_n_u16(dst_wide, 8);
563 
564         /* sum -- ignoring any byte lane overflows */
565         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
566     }
567 #endif
568 
569         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
570 #if    UNROLL > 2
571         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
572 #endif
573 
574         src += UNROLL;
575         dst += UNROLL;
576         count -= UNROLL;
577     }
578 #undef    UNROLL
579 
580     /* do any residual iterations */
581         while (--count >= 0) {
582             *dst = SkPMSrcOver(*src, *dst);
583             src += 1;
584             dst += 1;
585         }
586     }
587 }
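/* Per-pixel sketch of the operation vectorized above; the residual loop's
 * SkPMSrcOver() amounts to the same thing (helper name illustrative):
 */
#if 0
static inline SkPMColor pm_src_over(SkPMColor src, SkPMColor dst) {
    // scale dst by (256 - src alpha) / 256 per channel, then add src
    return src + SkAlphaMulQ(dst, 256 - SkGetPackedA32(src));
}
#endif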
588 
589 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
590                                 const SkPMColor* SK_RESTRICT src,
591                                 int count, U8CPU alpha) {
592     SkASSERT(255 == alpha);
593 
594     if (count <= 0)
595         return;
596 
597     /* Use these to check if src is transparent or opaque */
598     const unsigned int ALPHA_OPAQ  = 0xFF000000;
599     const unsigned int ALPHA_TRANS = 0x00FFFFFF;
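    /* Because alpha occupies the top byte (SK_A32_SHIFT == 24), an unsigned
     * compare of the packed pixel against these constants classifies it:
     * <= 0x00FFFFFF means alpha == 0, >= 0xFF000000 means alpha == 0xFF. */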
600 
601 #define UNROLL  4
602     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
603     const SkPMColor* SK_RESTRICT src_temp = src;
604 
605     /* set up the NEON variables */
606     uint8x8_t alpha_mask;
607     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
608     alpha_mask = vld1_u8(alpha_mask_setup);
609 
610     uint8x8_t src_raw, dst_raw, dst_final;
611     uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
612     uint8x8_t dst_cooked;
613     uint16x8_t dst_wide;
614     uint8x8_t alpha_narrow;
615     uint16x8_t alpha_wide;
616 
617     /* choose the first processing type */
618     if( src >= src_end)
619         goto TAIL;
620     if(*src <= ALPHA_TRANS)
621         goto ALPHA_0;
622     if(*src >= ALPHA_OPAQ)
623         goto ALPHA_255;
624     /* fall-thru */
625 
626 ALPHA_1_TO_254:
627     do {
628 
629         /* get the source */
630         src_raw = vreinterpret_u8_u32(vld1_u32(src));
631         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
632 
633         /* get and hold the dst too */
634         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
635         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
636 
637 
638         /* get the alphas spread out properly */
639         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
640         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
641         /* we collapsed (255-a)+1 ... */
642         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
643 
644         /* spread the dest */
645         dst_wide = vmovl_u8(dst_raw);
646 
647         /* alpha mul the dest */
648         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
649         dst_cooked = vshrn_n_u16(dst_wide, 8);
650 
651         /* sum -- ignoring any byte lane overflows */
652         dst_final = vadd_u8(src_raw, dst_cooked);
653 
654         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
655         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
656         /* we collapsed (255-a)+1 ... */
657         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
658 
659         /* spread the dest */
660         dst_wide = vmovl_u8(dst_raw_2);
661 
662         /* alpha mul the dest */
663         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
664         dst_cooked = vshrn_n_u16(dst_wide, 8);
665 
666         /* sum -- ignoring any byte lane overflows */
667         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
668 
669         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
670         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
671 
672         src += UNROLL;
673         dst += UNROLL;
674 
675         /* if 2 of the next pixels aren't between 1 and 254,
676         it might make sense to go to the optimized loops */
677         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
678             break;
679 
680     } while(src < src_end);
681 
682     if (src >= src_end)
683         goto TAIL;
684 
685     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
686         goto ALPHA_255;
687 
688     /*fall-thru*/
689 
690 ALPHA_0:
691 
692     /*In this state, we know the current alpha is 0 and
693      we optimize for the next alpha also being zero. */
694     src_temp = src;  //so we don't have to increment dst every time
695     do {
696         if(*(++src) > ALPHA_TRANS)
697             break;
698         if(*(++src) > ALPHA_TRANS)
699             break;
700         if(*(++src) > ALPHA_TRANS)
701             break;
702         if(*(++src) > ALPHA_TRANS)
703             break;
704     } while(src < src_end);
705 
706     dst += (src - src_temp);
707 
708     /* no longer alpha 0, so determine where to go next. */
709     if( src >= src_end)
710         goto TAIL;
711     if(*src >= ALPHA_OPAQ)
712         goto ALPHA_255;
713     else
714         goto ALPHA_1_TO_254;
715 
716 ALPHA_255:
717     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
718         dst[0]=src[0];
719         dst[1]=src[1];
720         dst[2]=src[2];
721         dst[3]=src[3];
722         src+=UNROLL;
723         dst+=UNROLL;
724         if(src >= src_end)
725             goto TAIL;
726     }
727 
728     //Handle remainder.
729     if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
730         if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
731             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
732         }
733     }
734 
735     if( src >= src_end)
736         goto TAIL;
737     if(*src <= ALPHA_TRANS)
738         goto ALPHA_0;
739     else
740         goto ALPHA_1_TO_254;
741 
742 TAIL:
743     /* do any residual iterations */
744     src_end += UNROLL + 1;  // go to the real end
745     while(src != src_end) {
746         if( *src != 0 ) {
747             if( *src >= ALPHA_OPAQ ) {
748                 *dst = *src;
749             }
750             else {
751                 *dst = SkPMSrcOver(*src, *dst);
752             }
753         }
754         src++;
755         dst++;
756     }
757 
758 #undef    UNROLL
759     return;
760 }
761 
762 /* Neon version of S32_Blend_BlitRow32()
763  * portable version is in src/core/SkBlitRow_D32.cpp
764  */
765 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
766                               const SkPMColor* SK_RESTRICT src,
767                               int count, U8CPU alpha) {
768     SkASSERT(alpha <= 255);
769     if (count > 0) {
770         uint16_t src_scale = SkAlpha255To256(alpha);
771         uint16_t dst_scale = 256 - src_scale;
772 
773     /* run them N at a time through the NEON unit */
774     /* note that each pixel is 4 bytes, each byte treated exactly the same,
775      * so we can work under that guise. We *do* know that the src&dst
776      * will be 32-bit aligned quantities, so we can specify that on
777      * the load/store ops and do a neon 'reinterpret' to get us to
778      * byte-sized (pun intended) pieces that we widen/multiply/shift
779      * we're limited at 128 bits in the wide ops, which is 8x16bits
780      * or a pair of 32 bit src/dsts.
781      */
782     /* we *could* manually unroll this loop so that we load 128 bits
783      * (as a pair of 64s) from each of src and dst, processing them
784      * in pieces. This might give us a little better management of
785      * the memory latency, but my initial attempts here did not
786      * produce an instruction stream that looked all that nice.
787      */
788 #define    UNROLL    2
789     while (count >= UNROLL) {
790         uint8x8_t  src_raw, dst_raw, dst_final;
791         uint16x8_t  src_wide, dst_wide;
792 
793         /* get 64 bits of src, widen it, multiply by src_scale */
794         src_raw = vreinterpret_u8_u32(vld1_u32(src));
795         src_wide = vmovl_u8(src_raw);
796         /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
797         src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
798 
799         /* ditto with dst */
800         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
801         dst_wide = vmovl_u8(dst_raw);
802 
803         /* combine add with dst multiply into mul-accumulate */
804         dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
805 
806         dst_final = vshrn_n_u16(dst_wide, 8);
807         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
808 
809         src += UNROLL;
810         dst += UNROLL;
811         count -= UNROLL;
812     }
813     /* RBE: well, I don't like how gcc manages src/dst across the above
814      * loop; it's constantly calculating src+bias and dst+bias, and it only
815      * adjusts the real ones when we leave the loop. Not sure why
816      * it's "hoisting down" (hoisting implies above in my lexicon ;))
817      * the adjustments to src/dst/count, but it does
818      * (might be SSA-style internal logic).
819      */
820 
821 #if    UNROLL == 2
822     if (count == 1) {
823             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
824     }
825 #else
826     if (count > 0) {
827             do {
828                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
829                 src += 1;
830                 dst += 1;
831             } while (--count > 0);
832     }
833 #endif
834 
835 #undef    UNROLL
836     }
837 }
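/* Per-pixel reference for the loop above: each channel ends up as
 * (src * src_scale + dst * dst_scale) >> 8. SkAlphaMulQ(), used in the tail,
 * does the multiply-and-shift two channels at a time, roughly as sketched
 * below (illustrative, not the Skia header verbatim):
 */
#if 0
static inline uint32_t alpha_mul_q_sketch(uint32_t c, unsigned scale256) {
    const uint32_t mask = 0x00FF00FF;
    uint32_t rb = (((c & mask) * scale256) >> 8) & mask;   // red and blue
    uint32_t ag = (((c >> 8) & mask) * scale256) & ~mask;  // alpha and green
    return rb | ag;
}
#endif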
838 
839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
840                          const SkPMColor* SK_RESTRICT src,
841                          int count, U8CPU alpha) {
842 
843     SkASSERT(255 >= alpha);
844 
845     if (count <= 0) {
846         return;
847     }
848 
849     unsigned alpha256 = SkAlpha255To256(alpha);
850 
851     // First deal with odd counts
852     if (count & 1) {
853         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
854         uint16x8_t vdst_wide, vsrc_wide;
855         unsigned dst_scale;
856 
857         // Load
858         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
859         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
860 
861         // Calc dst_scale
862         dst_scale = vget_lane_u8(vsrc, 3);
863         dst_scale *= alpha256;
864         dst_scale >>= 8;
865         dst_scale = 256 - dst_scale;
866 
867         // Process src
868         vsrc_wide = vmovl_u8(vsrc);
869         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
870 
871         // Process dst
872         vdst_wide = vmovl_u8(vdst);
873         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
874 
875         // Combine
876         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
877 
878         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
879         dst++;
880         src++;
881         count--;
882     }
883 
884     if (count) {
885         uint8x8_t alpha_mask;
886         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
887         alpha_mask = vld1_u8(alpha_mask_setup);
888 
889         do {
890 
891             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
892             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
893 
894             __builtin_prefetch(src+32);
895             __builtin_prefetch(dst+32);
896 
897             // Load
898             vsrc = vreinterpret_u8_u32(vld1_u32(src));
899             vdst = vreinterpret_u8_u32(vld1_u32(dst));
900 
901             // Prepare src_scale
902             vsrc_scale = vdupq_n_u16(alpha256);
903 
904             // Calc dst_scale
905             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
906             vdst_scale = vmovl_u8(vsrc_alphas);
907             vdst_scale *= vsrc_scale;
908             vdst_scale = vshrq_n_u16(vdst_scale, 8);
909             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
910 
911             // Process src
912             vsrc_wide = vmovl_u8(vsrc);
913             vsrc_wide *= vsrc_scale;
914 
915             // Process dst
916             vdst_wide = vmovl_u8(vdst);
917             vdst_wide *= vdst_scale;
918 
919             // Combine
920             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
921 
922             vst1_u32(dst, vreinterpret_u32_u8(vres));
923 
924             src += 2;
925             dst += 2;
926             count -= 2;
927         } while(count);
928     }
929 }
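/* Per-pixel sketch of the math implemented with intrinsics above (helper
 * name illustrative):
 */
#if 0
static inline SkPMColor blend32a_pixel(SkPMColor src, SkPMColor dst,
                                       unsigned alpha256) {
    unsigned dst_scale = 256 - ((SkGetPackedA32(src) * alpha256) >> 8);
    return SkAlphaMulQ(src, alpha256) + SkAlphaMulQ(dst, dst_scale);
}
#endif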
930 
931 ///////////////////////////////////////////////////////////////////////////////
932 
933 #undef    DEBUG_OPAQUE_DITHER
934 
935 #if    defined(DEBUG_OPAQUE_DITHER)
936 static void showme8(char *str, void *p, int len)
937 {
938     static char buf[256];
939     char tbuf[32];
940     int i;
941     char *pc = (char*) p;
942     sprintf(buf,"%8s:", str);
943     for(i=0;i<len;i++) {
944         sprintf(tbuf, "   %02x", pc[i]);
945         strcat(buf, tbuf);
946     }
947     SkDebugf("%s\n", buf);
948 }
949 static void showme16(char *str, void *p, int len)
950 {
951     static char buf[256];
952     char tbuf[32];
953     int i;
954     uint16_t *pc = (uint16_t*) p;
955     sprintf(buf,"%8s:", str);
956     len = (len / sizeof(uint16_t));    /* passed as bytes */
957     for(i=0;i<len;i++) {
958         sprintf(tbuf, " %04x", pc[i]);
959         strcat(buf, tbuf);
960     }
961     SkDebugf("%s\n", buf);
962 }
963 #endif
964 
965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
966                                    const SkPMColor* SK_RESTRICT src,
967                                    int count, U8CPU alpha, int x, int y) {
968     SkASSERT(255 == alpha);
969 
970 #define    UNROLL    8
971 
972     if (count >= UNROLL) {
973     uint8x8_t dbase;
974 
975 #if    defined(DEBUG_OPAQUE_DITHER)
976     uint16_t tmpbuf[UNROLL];
977     int td[UNROLL];
978     int tdv[UNROLL];
979     int ta[UNROLL];
980     int tap[UNROLL];
981     uint16_t in_dst[UNROLL];
982     int offset = 0;
983     int noisy = 0;
984 #endif
985 
986     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
987     dbase = vld1_u8(dstart);
988 
989         do {
990         uint8x8_t sr, sg, sb, sa, d;
991         uint16x8_t dst8, scale8, alpha8;
992         uint16x8_t dst_r, dst_g, dst_b;
993 
994 #if    defined(DEBUG_OPAQUE_DITHER)
995     /* calculate 8 elements worth into a temp buffer */
996     {
997       int my_y = y;
998       int my_x = x;
999       SkPMColor* my_src = (SkPMColor*)src;
1000       uint16_t* my_dst = dst;
1001       int i;
1002 
1003           DITHER_565_SCAN(my_y);
1004           for(i=0;i<UNROLL;i++) {
1005             SkPMColor c = *my_src++;
1006             SkPMColorAssert(c);
1007             if (c) {
1008                 unsigned a = SkGetPackedA32(c);
1009 
1010                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1011         tdv[i] = DITHER_VALUE(my_x);
1012         ta[i] = a;
1013         tap[i] = SkAlpha255To256(a);
1014         td[i] = d;
1015 
1016                 unsigned sr = SkGetPackedR32(c);
1017                 unsigned sg = SkGetPackedG32(c);
1018                 unsigned sb = SkGetPackedB32(c);
1019                 sr = SkDITHER_R32_FOR_565(sr, d);
1020                 sg = SkDITHER_G32_FOR_565(sg, d);
1021                 sb = SkDITHER_B32_FOR_565(sb, d);
1022 
1023                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1024                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1025                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1026                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1027                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1028         td[i] = d;
1029 
1030             } else {
1031         tmpbuf[i] = *my_dst;
1032         ta[i] = tdv[i] = td[i] = 0xbeef;
1033         }
1034         in_dst[i] = *my_dst;
1035             my_dst += 1;
1036             DITHER_INC_X(my_x);
1037           }
1038     }
1039 #endif
1040 
1041         /* source is in ABGR */
1042         {
1043         register uint8x8_t d0 asm("d0");
1044         register uint8x8_t d1 asm("d1");
1045         register uint8x8_t d2 asm("d2");
1046         register uint8x8_t d3 asm("d3");
1047 
1048         asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1049             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1050             : "r" (src)
1051                     );
1052             sr = d0; sg = d1; sb = d2; sa = d3;
1053         }
1054 
1055         /* calculate 'd', which will be 0..7 */
1056         /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1057 #if defined(SK_BUILD_FOR_ANDROID)
1058         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1059         alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1060 #else
1061         alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1062 #endif
1063         alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1064         d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
1065 
1066         /* sr = sr - (sr>>5) + d */
1067         /* watching for 8-bit overflow.  d is 0..7; risky range of
1068          * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1069          * safe  as long as we do ((sr-sr>>5) + d) */
1070         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1071         sr = vadd_u8(sr, d);
1072 
1073         /* sb = sb - (sb>>5) + d */
1074         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1075         sb = vadd_u8(sb, d);
1076 
1077         /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1078         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1079         sg = vadd_u8(sg, vshr_n_u8(d,1));
1080 
1081         /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1082         dst8 = vld1q_u16(dst);
1083         dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1084         dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1085         dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1086 
1087         /* blend */
1088 #if 1
1089         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1090         /* originally 255-sa + 1 */
1091         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1092 #else
1093         scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1094         scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1095 #endif
1096 
1097 #if 1
1098         /* combine the addq and mul, save 3 insns */
1099         scale8 = vshrq_n_u16(scale8, 3);
1100         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1101         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1102         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1103 #else
1104         /* known correct, but +3 insns over above */
1105         scale8 = vshrq_n_u16(scale8, 3);
1106         dst_b = vmulq_u16(dst_b, scale8);
1107         dst_g = vmulq_u16(dst_g, scale8);
1108         dst_r = vmulq_u16(dst_r, scale8);
1109 
1110         /* combine */
1111         /* NB: vshll widens, need to preserve those bits */
1112         dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1113         dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1114         dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1115 #endif
1116 
1117         /* repack to store */
1118         dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1119         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1120         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1121 
1122         vst1q_u16(dst, dst8);
1123 
1124 #if    defined(DEBUG_OPAQUE_DITHER)
1125         /* verify my 8 elements match the temp buffer */
1126     {
1127        int i, bad=0;
1128        static int invocation;
1129 
1130        for (i=0;i<UNROLL;i++)
1131         if (tmpbuf[i] != dst[i]) bad=1;
1132        if (bad) {
1133         SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1134             invocation, offset);
1135         SkDebugf("  alpha 0x%x\n", alpha);
1136         for (i=0;i<UNROLL;i++)
1137             SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1138             i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1139             dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1140 
1141         showme16("alpha8", &alpha8, sizeof(alpha8));
1142         showme16("scale8", &scale8, sizeof(scale8));
1143         showme8("d", &d, sizeof(d));
1144         showme16("dst8", &dst8, sizeof(dst8));
1145         showme16("dst_b", &dst_b, sizeof(dst_b));
1146         showme16("dst_g", &dst_g, sizeof(dst_g));
1147         showme16("dst_r", &dst_r, sizeof(dst_r));
1148         showme8("sb", &sb, sizeof(sb));
1149         showme8("sg", &sg, sizeof(sg));
1150         showme8("sr", &sr, sizeof(sr));
1151 
1152         /* cop out */
1153         return;
1154        }
1155        offset += UNROLL;
1156        invocation++;
1157     }
1158 #endif
1159 
1160             dst += UNROLL;
1161         src += UNROLL;
1162         count -= UNROLL;
1163         /* skip x += UNROLL, since it's unchanged mod-4 */
1164         } while (count >= UNROLL);
1165     }
1166 #undef    UNROLL
1167 
1168     /* residuals */
1169     if (count > 0) {
1170         DITHER_565_SCAN(y);
1171         do {
1172             SkPMColor c = *src++;
1173             SkPMColorAssert(c);
1174             if (c) {
1175                 unsigned a = SkGetPackedA32(c);
1176 
1177                 // dither and alpha are just temporary variables to work-around
1178                 // an ICE in debug.
1179                 unsigned dither = DITHER_VALUE(x);
1180                 unsigned alpha = SkAlpha255To256(a);
1181                 int d = SkAlphaMul(dither, alpha);
1182 
1183                 unsigned sr = SkGetPackedR32(c);
1184                 unsigned sg = SkGetPackedG32(c);
1185                 unsigned sb = SkGetPackedB32(c);
1186                 sr = SkDITHER_R32_FOR_565(sr, d);
1187                 sg = SkDITHER_G32_FOR_565(sg, d);
1188                 sb = SkDITHER_B32_FOR_565(sb, d);
1189 
1190                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1191                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1192                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1193                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1194                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1195             }
1196             dst += 1;
1197             DITHER_INC_X(x);
1198         } while (--count != 0);
1199     }
1200 }
1201 
1202 ///////////////////////////////////////////////////////////////////////////////
1203 
1204 #undef    DEBUG_S32_OPAQUE_DITHER
1205 
1206 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1207                                  const SkPMColor* SK_RESTRICT src,
1208                                  int count, U8CPU alpha, int x, int y) {
1209     SkASSERT(255 == alpha);
1210 
1211 #define    UNROLL    8
1212     if (count >= UNROLL) {
1213     uint8x8_t d;
1214     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1215     d = vld1_u8(dstart);
1216 
1217     while (count >= UNROLL) {
1218         uint8x8_t sr, sg, sb;
1219         uint16x8_t dr, dg, db;
1220         uint16x8_t dst8;
1221 
1222         {
1223         register uint8x8_t d0 asm("d0");
1224         register uint8x8_t d1 asm("d1");
1225         register uint8x8_t d2 asm("d2");
1226         register uint8x8_t d3 asm("d3");
1227 
1228         asm (
1229             "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1230             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1231             :
1232         );
1233         sg = d1;
1234 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1235         sr = d2; sb = d0;
1236 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1237         sr = d0; sb = d2;
1238 #endif
1239         }
1240         /* XXX: if we want to prefetch, hide it in the above asm()
1241          * using the gcc __builtin_prefetch(), the prefetch will
1242          * fall to the bottom of the loop -- it won't stick up
1243          * at the top of the loop, just after the vld4.
1244          */
1245 
1246         // sr = sr - (sr>>5) + d
1247         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1248         dr = vaddl_u8(sr, d);
1249 
1250         // sb = sb - (sb>>5) + d
1251         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1252         db = vaddl_u8(sb, d);
1253 
1254         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1255         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1256         dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1257 
1258         // pack high bits of each into 565 format  (rgb, b is lsb)
1259         dst8 = vshrq_n_u16(db, 3);
1260         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1261         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1262 
1263         // store it
1264         vst1q_u16(dst, dst8);
1265 
1266 #if    defined(DEBUG_S32_OPAQUE_DITHER)
1267         // always good to know if we generated good results
1268         {
1269         int i, myx = x, myy = y;
1270         DITHER_565_SCAN(myy);
1271         for (i=0;i<UNROLL;i++) {
1272             // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1273             SkPMColor c = src[i-8];
1274             unsigned dither = DITHER_VALUE(myx);
1275             uint16_t val = SkDitherRGB32To565(c, dither);
1276             if (val != dst[i]) {
1277             SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1278                 c, dither, val, dst[i], dstart[i]);
1279             }
1280             DITHER_INC_X(myx);
1281         }
1282         }
1283 #endif
1284 
1285         dst += UNROLL;
1286         // we don't need to increment src as the asm above has already done it
1287         count -= UNROLL;
1288         x += UNROLL;        // probably superfluous
1289     }
1290     }
1291 #undef    UNROLL
1292 
1293     // residuals
1294     if (count > 0) {
1295         DITHER_565_SCAN(y);
1296         do {
1297             SkPMColor c = *src++;
1298             SkPMColorAssert(c);
1299             SkASSERT(SkGetPackedA32(c) == 255);
1300 
1301             unsigned dither = DITHER_VALUE(x);
1302             *dst++ = SkDitherRGB32To565(c, dither);
1303             DITHER_INC_X(x);
1304         } while (--count != 0);
1305     }
1306 }
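/* Per-pixel sketch of the dither-and-pack done by the vector loop above,
 * mirroring its sr/sg/sb arithmetic (roughly what SkDitherRGB32To565() in
 * the residual loop computes; helper name illustrative):
 */
#if 0
static inline uint16_t dither_32_to_565(SkPMColor c, unsigned d /* 0..7 */) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    r = (r - (r >> 5) + d) >> 3;          // 8 -> 5 bits, full dither
    g = (g - (g >> 6) + (d >> 1)) >> 2;   // 8 -> 6 bits, half dither
    b = (b - (b >> 5) + d) >> 3;          // 8 -> 5 bits, full dither
    return SkPackRGB16(r, g, b);
}
#endif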
1307 
1308 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1309                       SkPMColor color) {
1310     if (count <= 0) {
1311         return;
1312     }
1313 
1314     if (0 == color) {
1315         if (src != dst) {
1316             memcpy(dst, src, count * sizeof(SkPMColor));
1317         }
1318         return;
1319     }
1320 
1321     unsigned colorA = SkGetPackedA32(color);
1322     if (255 == colorA) {
1323         sk_memset32(dst, color, count);
1324     } else {
1325         unsigned scale = 256 - SkAlpha255To256(colorA);
1326 
1327         if (count >= 8) {
1328             // at the end of this assembly, count will have been decremented
1329             // to a negative value. That is, if count mod 8 = x, it will be
1330             // -8 +x coming out.
1331             asm volatile (
1332                 PLD128(src, 0)
1333 
1334                 "vdup.32    q0, %[color]                \n\t"
1335 
1336                 PLD128(src, 128)
1337 
1338                 // scale is in the interval [0, 255], so load it as 8 bits
1339                 "vdup.8     d2, %[scale]                \n\t"
1340 
1341                 PLD128(src, 256)
1342 
1343                 "subs       %[count], %[count], #8      \n\t"
1344 
1345                 PLD128(src, 384)
1346 
1347                 "Loop_Color32:                          \n\t"
1348 
1349                 // load src color, 8 pixels, 4 64 bit registers
1350                 // (and increment src).
1351                 "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1352 
1353                 PLD128(src, 384)
1354 
1355                 // multiply long by scale, 64 bits at a time,
1356                 // destination into a 128 bit register.
1357                 "vmull.u8   q4, d4, d2                  \n\t"
1358                 "vmull.u8   q5, d5, d2                  \n\t"
1359                 "vmull.u8   q6, d6, d2                  \n\t"
1360                 "vmull.u8   q7, d7, d2                  \n\t"
1361 
1362                 // shift the 128 bit registers, containing the 16
1363                 // bit scaled values back to 8 bits, narrowing the
1364                 // results to 64 bit registers.
1365                 "vshrn.i16  d8, q4, #8                  \n\t"
1366                 "vshrn.i16  d9, q5, #8                  \n\t"
1367                 "vshrn.i16  d10, q6, #8                 \n\t"
1368                 "vshrn.i16  d11, q7, #8                 \n\t"
1369 
1370                 // adding back the color, using 128 bit registers.
1371                 "vadd.i8    q6, q4, q0                  \n\t"
1372                 "vadd.i8    q7, q5, q0                  \n\t"
1373 
1374                 // store back the 8 calculated pixels (2 128 bit
1375                 // registers), and increment dst.
1376                 "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1377 
1378                 "subs       %[count], %[count], #8      \n\t"
1379                 "bge        Loop_Color32                \n\t"
1380                 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1381                 : [color] "r" (color), [scale] "r" (scale)
1382                 : "cc", "memory",
1383                   "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1384                   "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1385                           );
1386             // At this point, if we went through the inline assembly, count is
1387             // a negative value:
1388             // if the value is -8, there is no pixel left to process.
1389             // if the value is -7, there is one pixel left to process
1390             // ...
1391             // And'ing it with 7 will give us the number of pixels
1392             // left to process.
1393             count = count & 0x7;
1394         }
1395 
1396         while (count > 0) {
1397             *dst = color + SkAlphaMulQ(*src, scale);
1398             src += 1;
1399             dst += 1;
1400             count--;
1401         }
1402     }
1403 }
1404 
1405 ///////////////////////////////////////////////////////////////////////////////
1406 
1407 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1408     // no dither
1409     // NOTE: For the S32_D565_Blend function below, we don't have a special
1410     //       version that assumes that each source pixel is opaque. But our
1411     //       S32A is still faster than the default, so use it.
1412     S32_D565_Opaque_neon,
1413     S32A_D565_Blend_neon,   // really S32_D565_Blend
1414     S32A_D565_Opaque_neon,
1415     S32A_D565_Blend_neon,
1416 
1417     // dither
1418     S32_D565_Opaque_Dither_neon,
1419     S32_D565_Blend_Dither_neon,
1420     S32A_D565_Opaque_Dither_neon,
1421     NULL,   // S32A_D565_Blend_Dither
1422 };
1423 
1424 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1425     NULL,   // S32_Opaque,
1426     S32_Blend_BlitRow32_neon,        // S32_Blend,
1427     /*
1428      * We have two choices for S32A_Opaque procs. One reads the src alpha
1429      * value and attempts to optimize accordingly.  The optimization is
1430      * sensitive to the source content and is not a win in all cases. For
1431      * example, if there are a lot of transitions between the alpha states,
1432      * the performance will almost certainly be worse.  However, for many
1433      * common cases the performance is equivalent or better than the standard
1434      * case where we do not inspect the src alpha.
1435      */
1436 #if SK_A32_SHIFT == 24
1437     // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1438     S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1439 #else
1440     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1441 #endif
1442     S32A_Blend_BlitRow32_neon        // S32A_Blend
1443 };
1444