1 /*
2  **
3  ** Copyright 2009, The Android Open Source Project
4  **
5  ** Licensed under the Apache License, Version 2.0 (the "License");
6  ** you may not use this file except in compliance with the License.
7  ** You may obtain a copy of the License at
8  **
9  **     http://www.apache.org/licenses/LICENSE-2.0
10  **
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  */
17 
18 #ifdef ANDROID
19     #include <machine/cpu-features.h>
20 #endif
21 
22 #include "SkBlitRow.h"
23 #include "SkColorPriv.h"
24 #include "SkDither.h"
25 
26 #if defined(__ARM_HAVE_NEON)
27 #include <arm_neon.h>
28 #endif
29 
30 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
31 static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
32                                   const SkPMColor* SK_RESTRICT src, int count,
33                                   U8CPU alpha, int /*x*/, int /*y*/) {
34     SkASSERT(255 == alpha);
35 
36     if (count >= 8) {
37         uint16_t* SK_RESTRICT keep_dst;
38 
39         asm volatile (
40                       "ands       ip, %[count], #7            \n\t"
41                       "vmov.u8    d31, #1<<7                  \n\t"
42                       "vld1.16    {q12}, [%[dst]]             \n\t"
43                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
44                       "moveq      ip, #8                      \n\t"
45                       "mov        %[keep_dst], %[dst]         \n\t"
46 
47                       "add        %[src], %[src], ip, LSL#2   \n\t"
48                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
49                       "subs       %[count], %[count], ip      \n\t"
50                       "b          9f                          \n\t"
51                       // LOOP
52                       "2:                                         \n\t"
53 
54                       "vld1.16    {q12}, [%[dst]]!            \n\t"
55                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
56                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
57                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
58                       "subs       %[count], %[count], #8      \n\t"
59                       "9:                                         \n\t"
60                       "pld        [%[dst],#32]                \n\t"
61                       // expand 0565 q12 to 8888 {d4-d7}
62                       "vmovn.u16  d4, q12                     \n\t"
63                       "vshr.u16   q11, q12, #5                \n\t"
64                       "vshr.u16   q10, q12, #6+5              \n\t"
65                       "vmovn.u16  d5, q11                     \n\t"
66                       "vmovn.u16  d6, q10                     \n\t"
67                       "vshl.u8    d4, d4, #3                  \n\t"
68                       "vshl.u8    d5, d5, #2                  \n\t"
69                       "vshl.u8    d6, d6, #3                  \n\t"
70 
71                       "vmovl.u8   q14, d31                    \n\t"
72                       "vmovl.u8   q13, d31                    \n\t"
73                       "vmovl.u8   q12, d31                    \n\t"
74 
75                       // duplicate in 4/2/1 & 8pix vsns
76                       "vmvn.8     d30, d3                     \n\t"
77                       "vmlal.u8   q14, d30, d6                \n\t"
78                       "vmlal.u8   q13, d30, d5                \n\t"
79                       "vmlal.u8   q12, d30, d4                \n\t"
80                       "vshr.u16   q8, q14, #5                 \n\t"
81                       "vshr.u16   q9, q13, #6                 \n\t"
82                       "vaddhn.u16 d6, q14, q8                 \n\t"
83                       "vshr.u16   q8, q12, #5                 \n\t"
84                       "vaddhn.u16 d5, q13, q9                 \n\t"
85                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
86                       "vaddhn.u16 d4, q12, q8                 \n\t"
87                       // intentionally don't calculate alpha
88                       // result in d4-d6
89 
90                       "vqadd.u8   d5, d5, d1                  \n\t"
91                       "vqadd.u8   d4, d4, d2                  \n\t"
92 
93                       // pack 8888 {d4-d6} to 0565 q10
94                       "vshll.u8   q10, d6, #8                 \n\t"
95                       "vshll.u8   q3, d5, #8                  \n\t"
96                       "vshll.u8   q2, d4, #8                  \n\t"
97                       "vsri.u16   q10, q3, #5                 \n\t"
98                       "vsri.u16   q10, q2, #11                \n\t"
99 
100                       "bne        2b                          \n\t"
101 
102                       "1:                                         \n\t"
103                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
104                       : [count] "+r" (count)
105                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
106                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
107                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
108                       "d30","d31"
109                       );
110     }
111     else
112     {   // handle count < 8
113         uint16_t* SK_RESTRICT keep_dst;
114 
115         asm volatile (
116                       "vmov.u8    d31, #1<<7                  \n\t"
117                       "mov        %[keep_dst], %[dst]         \n\t"
118 
119                       "tst        %[count], #4                \n\t"
120                       "beq        14f                         \n\t"
121                       "vld1.16    {d25}, [%[dst]]!            \n\t"
122                       "vld1.32    {q1}, [%[src]]!             \n\t"
123 
124                       "14:                                        \n\t"
125                       "tst        %[count], #2                \n\t"
126                       "beq        12f                         \n\t"
127                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
128                       "vld1.32    {d1}, [%[src]]!             \n\t"
129 
130                       "12:                                        \n\t"
131                       "tst        %[count], #1                \n\t"
132                       "beq        11f                         \n\t"
133                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
134                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
135 
136                       "11:                                        \n\t"
137                       // unzips achieve the same as a vld4 operation
138                       "vuzpq.u16  q0, q1                      \n\t"
139                       "vuzp.u8    d0, d1                      \n\t"
140                       "vuzp.u8    d2, d3                      \n\t"
141                       // expand 0565 q12 to 8888 {d4-d7}
142                       "vmovn.u16  d4, q12                     \n\t"
143                       "vshr.u16   q11, q12, #5                \n\t"
144                       "vshr.u16   q10, q12, #6+5              \n\t"
145                       "vmovn.u16  d5, q11                     \n\t"
146                       "vmovn.u16  d6, q10                     \n\t"
147                       "vshl.u8    d4, d4, #3                  \n\t"
148                       "vshl.u8    d5, d5, #2                  \n\t"
149                       "vshl.u8    d6, d6, #3                  \n\t"
150 
151                       "vmovl.u8   q14, d31                    \n\t"
152                       "vmovl.u8   q13, d31                    \n\t"
153                       "vmovl.u8   q12, d31                    \n\t"
154 
155                       // duplicate in 4/2/1 & 8pix vsns
156                       "vmvn.8     d30, d3                     \n\t"
157                       "vmlal.u8   q14, d30, d6                \n\t"
158                       "vmlal.u8   q13, d30, d5                \n\t"
159                       "vmlal.u8   q12, d30, d4                \n\t"
160                       "vshr.u16   q8, q14, #5                 \n\t"
161                       "vshr.u16   q9, q13, #6                 \n\t"
162                       "vaddhn.u16 d6, q14, q8                 \n\t"
163                       "vshr.u16   q8, q12, #5                 \n\t"
164                       "vaddhn.u16 d5, q13, q9                 \n\t"
165                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
166                       "vaddhn.u16 d4, q12, q8                 \n\t"
167                       // intentionally don't calculate alpha
168                       // result in d4-d6
169 
170                       "vqadd.u8   d5, d5, d1                  \n\t"
171                       "vqadd.u8   d4, d4, d2                  \n\t"
172 
173                       // pack 8888 {d4-d6} to 0565 q10
174                       "vshll.u8   q10, d6, #8                 \n\t"
175                       "vshll.u8   q3, d5, #8                  \n\t"
176                       "vshll.u8   q2, d4, #8                  \n\t"
177                       "vsri.u16   q10, q3, #5                 \n\t"
178                       "vsri.u16   q10, q2, #11                \n\t"
179 
180                       // store
181                       "tst        %[count], #4                \n\t"
182                       "beq        24f                         \n\t"
183                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
184 
185                       "24:                                        \n\t"
186                       "tst        %[count], #2                \n\t"
187                       "beq        22f                         \n\t"
188                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
189 
190                       "22:                                        \n\t"
191                       "tst        %[count], #1                \n\t"
192                       "beq        21f                         \n\t"
193                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
194 
195                       "21:                                        \n\t"
196                       : [count] "+r" (count)
197                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
198                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
199                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
200                       "d30","d31"
201                       );
202     }
203 }
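/* Illustrative scalar sketch (not part of the original file): roughly the
 * per-pixel operation the NEON routine above performs.  The helper name
 * blend_32a_over_565() is hypothetical, and the NEON code uses a slightly
 * different rounding when scaling dst, so individual channels may differ by 1.
 */
#if 0
static inline uint16_t blend_32a_over_565(SkPMColor src, uint16_t dst) {
    unsigned isa = 255 - SkGetPackedA32(src);   /* inverse src alpha */
    /* bring dst up to 8-bit channels, scale by inverse alpha, add the
     * premultiplied src channels, clamp, then drop back to 5/6/5 precision */
    unsigned r = SkGetPackedR32(src) + SkMulDiv255Round(SkGetPackedR16(dst) << 3, isa);
    unsigned g = SkGetPackedG32(src) + SkMulDiv255Round(SkGetPackedG16(dst) << 2, isa);
    unsigned b = SkGetPackedB32(src) + SkMulDiv255Round(SkGetPackedB16(dst) << 3, isa);
    if (r > 255) { r = 255; }   /* mirrors the saturating vqadd.u8 */
    if (g > 255) { g = 255; }
    if (b > 255) { b = 255; }
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif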
204 
205 static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
206                                  const SkPMColor* SK_RESTRICT src, int count,
207                                  U8CPU alpha, int /*x*/, int /*y*/) {
208 
209     U8CPU alpha_for_asm = alpha;
210 
211     asm volatile (
212     /* This code implements a Neon version of S32A_D565_Blend. The output differs from
213      * the original in two respects:
214      *  1. The results have a few mismatches compared to the original code. These mismatches
215      *     never exceed 1. It's possible to improve accuracy vs. a floating point
216      *     implementation by introducing rounding right shifts (vrshr) for the final stage.
217      *     Rounding is not present in the code below, because although results would be closer
218      *     to a floating point implementation, the number of mismatches compared to the
219      *     original code would be far greater.
220      *  2. On certain inputs, the original code can overflow, causing colour channels to
221      *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
222      *     to affect another.
223      */
224 
225 #if 1
226 		/* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
227                   "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
228 #else
229                   "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
230 #endif
231                   "vmov.u16   q3, #255                        \n\t"   // set up constant
232                   "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
233                   "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
234                   "beq        2f                              \n\t"   // if count8 == 0, exit
235                   "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
236 
237                   "1:                                             \n\t"
238                   "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
239                   "subs       r4, r4, #1                      \n\t"   // decrement loop counter
240                   "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
241                   //  and deinterleave
242 
243                   "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
244                   "vand       q10, q0, q15                    \n\t"   // extract blue
245                   "vshr.u16   q8, q0, #11                     \n\t"   // extract red
246                   "vshr.u16   q9, q9, #10                     \n\t"   // extract green
247                   // dstrgb = {q8, q9, q10}
248 
249                   "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
250                   "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
251                   "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
252 
253                   "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
254                   "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
255                   "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
256                   "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
257                   // srcrgba = {q11, q12, q13, q14}
258 
259                   "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
260                   "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
261                   "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
262                   "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
263 
264                   "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
265                   "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
266                   // dst_scale = q2
267 
268                   "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
269                   "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
270                   "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
271 
272 #if 1
273 	// trying for a better match with SkDiv255Round(a)
274 	// C alg is:  a+=128; (a+a>>8)>>8
275 	// we'll use just a rounding shift [q2 is available for scratch]
276                   "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
277                   "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
278                   "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
279 #else
280 	// arm's original "truncating divide by 256"
281                   "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
282                   "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
283                   "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
284 #endif
285 
286                   "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
287                   "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
288                   "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
289 
290                   "bne        1b                              \n\t"   // if counter != 0, loop
291                   "2:                                             \n\t"   // exit
292 
293                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
294                   :
295                   : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
296                   );
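    /* Rounding note (added for clarity): for a 16-bit product x, the scalar
     * SkDiv255Round() path is roughly
     *     x += 128;  result = (x + (x >> 8)) >> 8;    // divide by 255, rounded
     * while the vrshr.u16 #8 used above computes
     *     result = (x + 128) >> 8;                    // divide by 256, rounded
     * The two can differ by 1, which accounts for the small mismatches
     * described in the comment at the top of this function.
     */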
297 
298     count &= 7;
299     if (count > 0) {
300         do {
301             SkPMColor sc = *src++;
302             if (sc) {
303                 uint16_t dc = *dst;
304                 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
305                 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
306                 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
307                 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
308                 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
309             }
310             dst += 1;
311         } while (--count != 0);
312     }
313 }
314 
315 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
316  * each dither value is spaced out into byte lanes, and repeated
317  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
318  * start of each row.
319  */
320 static const uint8_t gDitherMatrix_Neon[48] = {
321     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
322     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
323     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
324     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
325 
326 };
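/* Usage note (added for clarity): each 12-byte row above is one row of the
 * underlying 4x4 dither matrix repeated three times, so an 8-byte load such as
 *     dbase = vld1_u8(&gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)]);
 * yields the dither values for eight consecutive pixels starting at column x.
 */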
327 
328 static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
329                                        int count, U8CPU alpha, int x, int y)
330 {
331     /* select row and offset for dither array */
332     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
333 
334     /* rescale alpha to range 0 - 256 */
335     int scale = SkAlpha255To256(alpha);
336 
337     asm volatile (
338                   "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
339                   "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
340                   "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
341                   "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
342                   "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
343                   "1:                                                 \n\t"
344                   "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
345                   "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
346                   "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
347                   "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
348                   "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
349                   "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
350                   "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
351                   "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
352                   "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
353                   "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
354                   "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
355                   "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
356                   "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
357                   // load 8 pixels from dst, extract rgb
358                   "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
359                   "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
360                   "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
361                   "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
362                   "vand           d17, d17, d29                   \n\t"   // and green with green mask
363                   "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
364                   "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
365                   // src = {d22 (r), d23 (g), d24 (b)}
366                   // dst = {d16 (r), d17 (g), d18 (b)}
367                   // subtract dst from src and widen
368                   "vsubl.s8       q0, d22, d16                    \n\t"   // red: src minus dst, widened
369                   "vsubl.s8       q1, d23, d17                    \n\t"   // green: src minus dst, widened
370                   "vsubl.s8       q2, d24, d18                    \n\t"   // blue: src minus dst, widened
371                   // multiply diffs by scale and shift
372                   "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
373                   "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
374                   "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
375                   "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
376                   "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
377                   "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
378                   "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
379                   // add dst to result
380                   "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
381                   "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
382                   "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
383                   // put result into 565 format
384                   "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
385                   "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
386                   "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
387                   "bgt            1b                              \n\t"   // loop if count > 0
388                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
389                   : [dstart] "r" (dstart), [scale] "r" (scale)
390                   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
391                   );
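    /* Note (added): per channel, the NEON loop above computes
     *     dst + (((src_dithered - dst) * scale) >> 8)
     * which is the same SkAlphaBlend() used by the scalar tail below.
     */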
392 
393     DITHER_565_SCAN(y);
394 
395     while((count & 7) > 0)
396     {
397         SkPMColor c = *src++;
398 
399         int dither = DITHER_VALUE(x);
400         int sr = SkGetPackedR32(c);
401         int sg = SkGetPackedG32(c);
402         int sb = SkGetPackedB32(c);
403         sr = SkDITHER_R32To565(sr, dither);
404         sg = SkDITHER_G32To565(sg, dither);
405         sb = SkDITHER_B32To565(sb, dither);
406 
407         uint16_t d = *dst;
408         *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
409                              SkAlphaBlend(sg, SkGetPackedG16(d), scale),
410                              SkAlphaBlend(sb, SkGetPackedB16(d), scale));
411         DITHER_INC_X(x);
412         count--;
413     }
414 }
415 
416 #define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
417 #define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
418 #define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
419 #else
420 #define S32A_D565_Opaque_PROC       NULL
421 #define S32A_D565_Blend_PROC        NULL
422 #define S32_D565_Blend_Dither_PROC  NULL
423 #endif
424 
425 /* We don't have a special version that assumes each src is opaque, but our
426  * S32A version is still faster than the default, so use it here.
427  */
428 #define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
429 #define S32_D565_Blend_PROC     S32A_D565_Blend_PROC
430 
431 ///////////////////////////////////////////////////////////////////////////////
432 
433 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
434 
435 static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
436                                   const SkPMColor* SK_RESTRICT src,
437                                   int count, U8CPU alpha) {
438 
439     SkASSERT(255 == alpha);
440     if (count > 0) {
441 
442 
443 	uint8x8_t alpha_mask;
444 
445 	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
446 	alpha_mask = vld1_u8(alpha_mask_setup);
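	/* Note (added): src_raw below holds two 32-bit ABGR pixels as eight
	 * bytes; alpha lives in byte 3 of each pixel, so the table indices
	 * {3,3,3,3,7,7,7,7} used with vtbl1_u8() replicate each pixel's alpha
	 * across that pixel's four byte lanes.
	 */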
447 
448 	/* do the NEON unrolled code */
449 #define	UNROLL	4
450 	while (count >= UNROLL) {
451 	    uint8x8_t src_raw, dst_raw, dst_final;
452 	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
453 
454 	    /* get the source */
455 	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
456 #if	UNROLL > 2
457 	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
458 #endif
459 
460 	    /* get and hold the dst too */
461 	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
462 #if	UNROLL > 2
463 	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
464 #endif
465 
466 	/* 1st and 2nd bits of the unrolling */
467 	{
468 	    uint8x8_t dst_cooked;
469 	    uint16x8_t dst_wide;
470 	    uint8x8_t alpha_narrow;
471 	    uint16x8_t alpha_wide;
472 
473 	    /* get the alphas spread out properly */
474 	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
475 #if 1
476 	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
477 	    /* we collapsed (255-a)+1 ... */
478 	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
479 #else
480 	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
481 	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
482 #endif
483 
484 	    /* spread the dest */
485 	    dst_wide = vmovl_u8(dst_raw);
486 
487 	    /* alpha mul the dest */
488 	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
489 	    dst_cooked = vshrn_n_u16(dst_wide, 8);
490 
491 	    /* sum -- ignoring any byte lane overflows */
492 	    dst_final = vadd_u8(src_raw, dst_cooked);
493 	}
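	/* Note (added): per byte lane this computes
	 *     dst_final = src + ((dst * (256 - srcA)) >> 8)
	 * i.e. the same SkPMSrcOver() used in the residual loop below.
	 */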
494 
495 #if	UNROLL > 2
496 	/* the 3rd and 4th bits of our unrolling */
497 	{
498 	    uint8x8_t dst_cooked;
499 	    uint16x8_t dst_wide;
500 	    uint8x8_t alpha_narrow;
501 	    uint16x8_t alpha_wide;
502 
503 	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
504 #if 1
505 	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
506 	    /* we collapsed (255-a)+1 ... */
507 	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
508 #else
509 	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
510 	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
511 #endif
512 
513 	    /* spread the dest */
514 	    dst_wide = vmovl_u8(dst_raw_2);
515 
516 	    /* alpha mul the dest */
517 	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
518 	    dst_cooked = vshrn_n_u16(dst_wide, 8);
519 
520 	    /* sum -- ignoring any byte lane overflows */
521 	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
522 	}
523 #endif
524 
525 	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
526 #if	UNROLL > 2
527 	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
528 #endif
529 
530 	    src += UNROLL;
531 	    dst += UNROLL;
532 	    count -= UNROLL;
533 	}
534 #undef	UNROLL
535 
536 	/* do any residual iterations */
537         while (--count >= 0) {
538 #ifdef TEST_SRC_ALPHA
539             SkPMColor sc = *src;
540             if (sc) {
541                 unsigned srcA = SkGetPackedA32(sc);
542                 SkPMColor result = sc;
543                 if (srcA != 255) {
544                     result = SkPMSrcOver(sc, *dst);
545                 }
546                 *dst = result;
547             }
548 #else
549             *dst = SkPMSrcOver(*src, *dst);
550 #endif
551             src += 1;
552             dst += 1;
553         }
554     }
555 }
556 
557 #define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
558 
559 #else
560 
561 #ifdef TEST_SRC_ALPHA
562 #error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
563 #endif
564 
565 static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
566                                   const SkPMColor* SK_RESTRICT src,
567                                   int count, U8CPU alpha) {
568 
569     SkASSERT(255 == alpha);
570 
571     /* Does not support the TEST_SRC_ALPHA case */
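    /* Illustrative note (added): per pixel the asm below computes, in effect,
     *     scale = 256 - SkGetPackedA32(*src);
     *     *dst  = *src + SkAlphaMulQ(*dst, scale);
     * i.e. SkPMSrcOver(), using the 0x00FF00FF mask so that the B/R and A/G
     * byte pairs can each be scaled with a single 32-bit multiply.
     */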
572     asm volatile (
573                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
574                   "beq    3f                         \n\t" /* if zero exit */
575 
576                   "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
577                   "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */
578 
579                   "cmp    %[count], #2               \n\t" /* compare count with 2 */
580                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
581 
582                   /* Double Loop */
583                   "1:                                \n\t" /* <double loop> */
584                   "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
585                   "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
586                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
587 
588                   /* ----------- */
589                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
590                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
591                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
592 
593                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
594                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
595                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
596 
597                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
598                   "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
599                   "orr    r7, r9, r10                \n\t" /* br | ag*/
600 
601                   "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
602                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
603 
604                   /* ----------- */
605                   "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */
606 
607                   "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
608                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
609                   "sub    %[count], %[count], #2     \n\t"
610                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
611 
612                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
613                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
614                   "cmp    %[count], #1               \n\t" /* comparing count with 1 */
615                   "orr    r8, r9, r10                \n\t" /* br | ag */
616 
617                   "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */
618 
619                   /* ----------------- */
620                   "stm    %[dst]!, {r7,r8}           \n\t" /* store r7,r8 to *dst; advance dst by two pixels */
621                   /* ----------------- */
622 
623                   "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
624                   "blt    3f                         \n\t" /* if less than 1 -> exit */
625 
626                   /* Single Loop */
627                   "2:                                \n\t" /* <single loop> */
628                   "ldr    r5, [%[src]], #4           \n\t" /* load the src pointer into r5 r5=src */
629                   "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
630                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
631 
632                   /* ----------- */
633                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
634                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
635 
636                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
637                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
638                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
639                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
640 
641                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
642                   "orr    r7, r9, r10                \n\t" /* br | ag */
643 
644                   "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */
645 
646                   /* ----------------- */
647                   "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
648                   /* ----------------- */
649 
650                   "3:                                \n\t" /* <exit> */
651                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
652                   :
653                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
654                   );
655 }
656 #define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_arm
657 #endif
658 
659 /*
660  * ARM asm version of S32A_Blend_BlitRow32
661  */
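/* Illustrative note (added): per pixel this routine computes, in effect,
 *     src_scale = alpha + 1;                                        // 1..256
 *     dst_scale = 256 - ((SkGetPackedA32(*src) * src_scale) >> 8);
 *     *dst      = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
 * using the same 0x00FF00FF byte-pair trick as the opaque version above.
 */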
662 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
663                                  const SkPMColor* SK_RESTRICT src,
664                                  int count, U8CPU alpha) {
665     asm volatile (
666                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
667                   "beq    3f                         \n\t" /* if zero exit */
668 
669                   "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
670                   "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
671 
672                   /* src1,2_scale */
673                   "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
674 
675                   "cmp    %[count], #2               \n\t" /* comparing count with 2 */
676                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
677 
678                   /* Double Loop */
679                   "1:                                \n\t" /* <double loop> */
680                   "ldm    %[src]!, {r5, r6}          \n\t" /* loading src pointers into r5 and r6 */
681                   "ldm    %[dst], {r7, r8}           \n\t" /* loading dst pointers into r7 and r8 */
682 
683                   /* dst1_scale and dst2_scale*/
684                   "lsr    r9, r5, #24                \n\t" /* src >> 24 */
685                   "lsr    r10, r6, #24               \n\t" /* src >> 24 */
686                   "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
687                   "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
688                   "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
689                   "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
690                   "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
691                   "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
692 
693                   /* ---------------------- */
694 
695                   /* src1, src1_scale */
696                   "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
697                   "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
698                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
699                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
700                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
701                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
702                   "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
703 
704                   /* dst1, dst1_scale */
705                   "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
706                   "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
707                   "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
708                   "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
709                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
710                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
711                   "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
712 
713                   /* ---------------------- */
714                   "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
715                   /* ---------------------- */
716 
717                   /* ====================== */
718 
719                   /* src2, src2_scale */
720                   "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
721                   "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
722                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
723                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
724                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
725                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
726                   "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
727 
728                   /* dst2, dst2_scale */
729                   "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
730                   "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
731                   "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
732                   "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
733                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
734                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
735                   "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
736 
737                   "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
738                   /* ---------------------- */
739                   "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
740                   /* ---------------------- */
741                   "cmp    %[count], #1               \n\t" /* compare count with 1 */
742                   /* ----------------- */
743                   "stm    %[dst]!, {r9, r10}         \n\t" /* store r9, r10 to *dst; advance dst by two pixels */
744                   /* ----------------- */
745 
746                   "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
747                   "blt    3f                         \n\t" /* if %[count] less than 1 exit */
748                                                            /* else get into the single loop */
749                   /* Single Loop */
750                   "2:                                \n\t" /* <single loop> */
751                   "ldr    r5, [%[src]], #4           \n\t" /* loading src pointer into r5: r5=src */
752                   "ldr    r7, [%[dst]]               \n\t" /* loading dst pointer into r7: r7=dst */
753 
754                   "lsr    r6, r5, #24                \n\t" /* src >> 24 */
755                   "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
756                   "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
757                   "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
758                   "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
759                   "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
760                   "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
761 
762                   /* src, src_scale */
763                   "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
764                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
765                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
766                   "orr    r10, r8, r9                \n\t" /* r10 = (src, src_scale) */
767 
768                   /* dst, dst_scale */
769                   "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
770                   "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
771                   "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
772                   "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
773                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
774                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
775                   "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
776 
777                   "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
778 
779                   /* ----------------- */
780                   "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
781                   /* ----------------- */
782 
783                   "3:                                \n\t" /* <exit> */
784                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
785                   :
786                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
787                   );
788 
789 }
790 #define	S32A_Blend_BlitRow32_PROC	S32A_Blend_BlitRow32_arm
791 
792 /* Neon version of S32_Blend_BlitRow32()
793  * portable version is in src/core/SkBlitRow_D32.cpp
794  */
795 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
796 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
797                                 const SkPMColor* SK_RESTRICT src,
798                                 int count, U8CPU alpha) {
799     SkASSERT(alpha <= 255);
800     if (count > 0) {
801         uint16_t src_scale = SkAlpha255To256(alpha);
802         uint16_t dst_scale = 256 - src_scale;
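        /* Note (added): each pixel below becomes, per byte lane,
         *     (src * src_scale + dst * dst_scale) >> 8
         * computed for a pair of pixels at a time in the NEON loop. */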
803 
804 	/* run them N at a time through the NEON unit */
805 	/* note that each pixel is 4 bytes, each treated exactly the same,
806 	 * so we can work under that guise. We *do* know that the src&dst
807 	 * will be 32-bit aligned quantities, so we can specify that on
808 	 * the load/store ops and do a neon 'reinterpret' to get us to
809 	 * byte-sized (pun intended) pieces that we widen/multiply/shift.
810 	 * We're limited to 128 bits in the wide ops, which is 8x16 bits,
811 	 * or a pair of 32-bit src/dst pixels.
812 	 */
813 	/* we *could* manually unroll this loop so that we load 128 bits
814 	 * (as a pair of 64s) from each of src and dst, processing them
815 	 * in pieces. This might give us a little better management of
816 	 * the memory latency, but my initial attempts here did not
817 	 * produce an instruction stream that looked all that nice.
818 	 */
819 #define	UNROLL	2
820 	while (count >= UNROLL) {
821 	    uint8x8_t  src_raw, dst_raw, dst_final;
822 	    uint16x8_t  src_wide, dst_wide;
823 
824 	    /* get 64 bits of src, widen it, multiply by src_scale */
825 	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
826 	    src_wide = vmovl_u8(src_raw);
827 	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
828 	    src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
829 
830 	    /* ditto with dst */
831 	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
832 	    dst_wide = vmovl_u8(dst_raw);
833 
834 	    /* combine add with dst multiply into mul-accumulate */
835 	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
836 
837 	    dst_final = vshrn_n_u16(dst_wide, 8);
838 	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
839 
840 	    src += UNROLL;
841 	    dst += UNROLL;
842 	    count -= UNROLL;
843 	}
844 	/* RBE: well, I don't like how gcc manages src/dst across the above
845 	 * loop: it's constantly calculating src+bias, dst+bias and it only
846 	 * adjusts the real ones when we leave the loop. Not sure why
847 	 * it's "hoisting down" (hoisting implies above in my lexicon ;))
848 	 * the adjustments to src/dst/count, but it does...
849 	 * (might be SSA-style internal logic...)
850 	 */
851 
852 #if	UNROLL == 2
853 	if (count == 1) {
854             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
855 	}
856 #else
857 	if (count > 0) {
858             do {
859                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
860                 src += 1;
861                 dst += 1;
862             } while (--count > 0);
863 	}
864 #endif
865 
866 #undef	UNROLL
867     }
868 }
869 
870 #define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
871 #else
872 #define	S32_Blend_BlitRow32_PROC	NULL
873 #endif
874 
875 ///////////////////////////////////////////////////////////////////////////////
876 
877 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
878 
879 #undef	DEBUG_OPAQUE_DITHER
880 
881 #if	defined(DEBUG_OPAQUE_DITHER)
882 static void showme8(char *str, void *p, int len)
883 {
884 	static char buf[256];
885 	char tbuf[32];
886 	int i;
887 	char *pc = (char*) p;
888 	sprintf(buf,"%8s:", str);
889 	for(i=0;i<len;i++) {
890 	    sprintf(tbuf, "   %02x", pc[i]);
891 	    strcat(buf, tbuf);
892 	}
893 	SkDebugf("%s\n", buf);
894 }
895 static void showme16(char *str, void *p, int len)
896 {
897 	static char buf[256];
898 	char tbuf[32];
899 	int i;
900 	uint16_t *pc = (uint16_t*) p;
901 	sprintf(buf,"%8s:", str);
902 	len = (len / sizeof(uint16_t));	/* passed as bytes */
903 	for(i=0;i<len;i++) {
904 	    sprintf(tbuf, " %04x", pc[i]);
905 	    strcat(buf, tbuf);
906 	}
907 	SkDebugf("%s\n", buf);
908 }
909 #endif
910 
911 static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
912                                       const SkPMColor* SK_RESTRICT src,
913                                       int count, U8CPU alpha, int x, int y) {
914     SkASSERT(255 == alpha);
915 
916 #define	UNROLL	8
917 
918     if (count >= UNROLL) {
919 	uint8x8_t dbase;
920 
921 #if	defined(DEBUG_OPAQUE_DITHER)
922 	uint16_t tmpbuf[UNROLL];
923 	int td[UNROLL];
924 	int tdv[UNROLL];
925 	int ta[UNROLL];
926 	int tap[UNROLL];
927 	uint16_t in_dst[UNROLL];
928 	int offset = 0;
929 	int noisy = 0;
930 #endif
931 
932 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
933 	dbase = vld1_u8(dstart);
934 
935         do {
936 	    uint8x8_t sr, sg, sb, sa, d;
937 	    uint16x8_t dst8, scale8, alpha8;
938 	    uint16x8_t dst_r, dst_g, dst_b;
939 
940 #if	defined(DEBUG_OPAQUE_DITHER)
941 	/* calculate 8 elements worth into a temp buffer */
942 	{
943 	  int my_y = y;
944 	  int my_x = x;
945 	  SkPMColor* my_src = (SkPMColor*)src;
946 	  uint16_t* my_dst = dst;
947 	  int i;
948 
949           DITHER_565_SCAN(my_y);
950           for(i=0;i<UNROLL;i++) {
951             SkPMColor c = *my_src++;
952             SkPMColorAssert(c);
953             if (c) {
954                 unsigned a = SkGetPackedA32(c);
955 
956                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
957 		tdv[i] = DITHER_VALUE(my_x);
958 		ta[i] = a;
959 		tap[i] = SkAlpha255To256(a);
960 		td[i] = d;
961 
962                 unsigned sr = SkGetPackedR32(c);
963                 unsigned sg = SkGetPackedG32(c);
964                 unsigned sb = SkGetPackedB32(c);
965                 sr = SkDITHER_R32_FOR_565(sr, d);
966                 sg = SkDITHER_G32_FOR_565(sg, d);
967                 sb = SkDITHER_B32_FOR_565(sb, d);
968 
969                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
970                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
971                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
972                 // now src and dst expanded are in g:11 r:10 x:1 b:10
973                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
974 		td[i] = d;
975 
976             } else {
977 		tmpbuf[i] = *my_dst;
978 		ta[i] = tdv[i] = td[i] = 0xbeef;
979 	    }
980 	    in_dst[i] = *my_dst;
981             my_dst += 1;
982             DITHER_INC_X(my_x);
983           }
984 	}
985 #endif
986 
987 	    /* source is in ABGR */
988 	    {
989 		register uint8x8_t d0 asm("d0");
990 		register uint8x8_t d1 asm("d1");
991 		register uint8x8_t d2 asm("d2");
992 		register uint8x8_t d3 asm("d3");
993 
994 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
995 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
996 		    : "r" (src)
997                     );
998 		    sr = d0; sg = d1; sb = d2; sa = d3;
999 	    }
1000 
1001 	    /* calculate 'd', which will be 0..7 */
1002 	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1003 #if ANDROID
1004 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1005 	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1006 #else
1007 	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1008 #endif
1009 	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1010 	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
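	    /* Note (added): each lane of d is now (dither * scale) >> 8, with
	     * scale being alpha expanded to 0..256, matching
	     * SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) in the scalar
	     * residual loop at the end of this function. */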
1011 
1012 	    /* sr = sr - (sr>>5) + d */
1013 	    /* watching for 8-bit overflow.  d is 0..7; risky range of
1014 	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1015 	     * safe  as long as we do ((sr-sr>>5) + d) */
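	    /* worked example (added): sr = 255, d = 7 gives (255 - 7) + 7 = 255,
	     * so the byte lane cannot wrap even at the extremes. */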
1016 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1017 	    sr = vadd_u8(sr, d);
1018 
1019 	    /* sb = sb - (sb>>5) + d */
1020 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1021 	    sb = vadd_u8(sb, d);
1022 
1023 	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1024 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1025 	    sg = vadd_u8(sg, vshr_n_u8(d,1));
1026 
1027 	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1028 	    dst8 = vld1q_u16(dst);
1029 	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1030 	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1031 	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
1032 
1033 	    /* blend */
1034 #if 1
1035 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1036 	    /* originally 255-sa + 1 */
1037 	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1038 #else
1039 	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1040 	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1041 #endif
1042 
1043 #if 1
1044 	    /* combine the addq and mul, save 3 insns */
1045 	    scale8 = vshrq_n_u16(scale8, 3);
1046 	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1047 	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1048 	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1049 #else
1050 	    /* known correct, but +3 insns over above */
1051 	    scale8 = vshrq_n_u16(scale8, 3);
1052 	    dst_b = vmulq_u16(dst_b, scale8);
1053 	    dst_g = vmulq_u16(dst_g, scale8);
1054 	    dst_r = vmulq_u16(dst_r, scale8);
1055 
1056 	    /* combine */
1057 	    /* NB: vshll widens, need to preserve those bits */
1058 	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1059 	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1060 	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1061 #endif
1062 
1063 	    /* repack to store */
1064 	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1065 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1066 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1067 
1068 	    vst1q_u16(dst, dst8);
1069 
1070 #if	defined(DEBUG_OPAQUE_DITHER)
1071 	    /* verify my 8 elements match the temp buffer */
1072 	{
1073 	   int i, bad=0;
1074 	   static int invocation;
1075 
1076 	   for (i=0;i<UNROLL;i++)
1077 		if (tmpbuf[i] != dst[i]) bad=1;
1078 	   if (bad) {
1079 		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1080 			invocation, offset);
1081 		SkDebugf("  alpha 0x%x\n", alpha);
1082 		for (i=0;i<UNROLL;i++)
1083 		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1084 			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1085 			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1086 
1087 		showme16("alpha8", &alpha8, sizeof(alpha8));
1088 		showme16("scale8", &scale8, sizeof(scale8));
1089 		showme8("d", &d, sizeof(d));
1090 		showme16("dst8", &dst8, sizeof(dst8));
1091 		showme16("dst_b", &dst_b, sizeof(dst_b));
1092 		showme16("dst_g", &dst_g, sizeof(dst_g));
1093 		showme16("dst_r", &dst_r, sizeof(dst_r));
1094 		showme8("sb", &sb, sizeof(sb));
1095 		showme8("sg", &sg, sizeof(sg));
1096 		showme8("sr", &sr, sizeof(sr));
1097 
1098 		/* cop out */
1099 		return;
1100 	   }
1101 	   offset += UNROLL;
1102 	   invocation++;
1103 	}
1104 #endif
1105 
1106             dst += UNROLL;
1107 	    src += UNROLL;
1108 	    count -= UNROLL;
1109 	    /* skip x += UNROLL, since it's unchanged mod-4 */
1110         } while (count >= UNROLL);
1111     }
1112 #undef	UNROLL
1113 
1114     /* residuals */
1115     if (count > 0) {
1116         DITHER_565_SCAN(y);
1117         do {
1118             SkPMColor c = *src++;
1119             SkPMColorAssert(c);
1120             if (c) {
1121                 unsigned a = SkGetPackedA32(c);
1122 
1123                 // dither and alpha are just temporary variables to work-around
1124                 // an ICE in debug.
1125                 unsigned dither = DITHER_VALUE(x);
1126                 unsigned alpha = SkAlpha255To256(a);
1127                 int d = SkAlphaMul(dither, alpha);
1128 
1129                 unsigned sr = SkGetPackedR32(c);
1130                 unsigned sg = SkGetPackedG32(c);
1131                 unsigned sb = SkGetPackedB32(c);
1132                 sr = SkDITHER_R32_FOR_565(sr, d);
1133                 sg = SkDITHER_G32_FOR_565(sg, d);
1134                 sb = SkDITHER_B32_FOR_565(sb, d);
1135 
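                /* the scalar tail blends in "expanded" form: r, g and b are
                 * spread across one 32-bit word with spare bits between them,
                 * so a single multiply scales all three channels before the
                 * result is compacted back to 565. */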
1136                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1137                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1138                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1139                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1140                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1141             }
1142             dst += 1;
1143             DITHER_INC_X(x);
1144         } while (--count != 0);
1145     }
1146 }
1147 
1148 #define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1149 #else
1150 #define	S32A_D565_Opaque_Dither_PROC NULL
1151 #endif
1152 
1153 ///////////////////////////////////////////////////////////////////////////////
1154 
1155 #if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1156 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1157  * speedup untested, but the ARM version is 26 insns/iteration and
1158  * this NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
1159  * i.e. roughly one tenth the ARM version's per-element count; that's pure
1160  * instruction counts, not accounting for any instruction or memory latencies.
1161  */
1162 
1163 #undef	DEBUG_S32_OPAQUE_DITHER
1164 
1165 static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1166                                      const SkPMColor* SK_RESTRICT src,
1167                                      int count, U8CPU alpha, int x, int y) {
1168     SkASSERT(255 == alpha);
1169 
1170 #define	UNROLL	8
1171     if (count >= UNROLL) {
1172 	uint8x8_t d;
1173 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1174 	d = vld1_u8(dstart);
1175 
1176 	while (count >= UNROLL) {
1177 	    uint8x8_t sr, sg, sb, sa;
1178 	    uint16x8_t dr, dg, db, da;
1179 	    uint16x8_t dst8;
1180 
1181 	    /* source is in ABGR ordering (R == lsb) */
1182 	    {
1183 		register uint8x8_t d0 asm("d0");
1184 		register uint8x8_t d1 asm("d1");
1185 		register uint8x8_t d2 asm("d2");
1186 		register uint8x8_t d3 asm("d3");
1187 
1188 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1189 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1190 		    : "r" (src)
1191                     );
1192 		    sr = d0; sg = d1; sb = d2; sa = d3;
1193 	    }
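	    /* the explicit d0..d3 bindings are presumably there because
	     * vld4.8 needs a constrained register list, which generic "=w"
	     * operands cannot guarantee; the deinterleaved planes then feed
	     * the intrinsics directly. */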
1194 	    /* XXX: if we want to prefetch, hide it in the above asm()
1195 	     * using the gcc __builtin_prefetch(), the prefetch will
1196 	     * fall to the bottom of the loop -- it won't stick up
1197 	     * at the top of the loop, just after the vld4.
1198 	     */
1199 
1200 	    /* sr = sr - (sr>>5) + d */
1201 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1202 	    dr = vaddl_u8(sr, d);
1203 
1204 	    /* sb = sb - (sb>>5) + d */
1205 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1206 	    db = vaddl_u8(sb, d);
1207 
1208 	    /* sg = sg - (sg>>6) + (d>>1); similar logic for overflows */
1209 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1210 	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
1211 	    /* XXX: check that the "d>>1" here is hoisted */
1212 
1213 	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
1214 	    dst8 = vshrq_n_u16(db, 3);
1215 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1216 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
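	    /* db/dg/dr stay within 8 bits thanks to the subtract-then-add
	     * dither above, so taking the top 5/6/5 bits with >>3, >>2, >>3
	     * packs into 565 without spilling into the neighbouring field. */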
1217 
1218 	    /* store it */
1219 	    vst1q_u16(dst, dst8);
1220 
1221 #if	defined(DEBUG_S32_OPAQUE_DITHER)
1222 	    /* always good to know if we generated good results */
1223 	    {
1224 		int i, myx = x, myy = y;
1225 		DITHER_565_SCAN(myy);
1226 		for (i=0;i<UNROLL;i++) {
1227 		    SkPMColor c = src[i];
1228 		    unsigned dither = DITHER_VALUE(myx);
1229 		    uint16_t val = SkDitherRGB32To565(c, dither);
1230 		    if (val != dst[i]) {
1231 			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
1232 			    c, dither, val, dst[i], dstart[i]);
1233 		    }
1234 		    DITHER_INC_X(myx);
1235 		}
1236 	    }
1237 #endif
1238 
1239 	    dst += UNROLL;
1240 	    src += UNROLL;
1241 	    count -= UNROLL;
1242 	    x += UNROLL;		/* probably superfluous */
1243 	}
1244     }
1245 #undef	UNROLL
1246 
1247     /* residuals */
1248     if (count > 0) {
1249         DITHER_565_SCAN(y);
1250         do {
1251             SkPMColor c = *src++;
1252             SkPMColorAssert(c);
1253             SkASSERT(SkGetPackedA32(c) == 255);
1254 
1255             unsigned dither = DITHER_VALUE(x);
1256             *dst++ = SkDitherRGB32To565(c, dither);
1257             DITHER_INC_X(x);
1258         } while (--count != 0);
1259     }
1260 }
1261 
1262 #define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1263 #else
1264 #define	S32_D565_Opaque_Dither_PROC NULL
1265 #endif
1266 
1267 ///////////////////////////////////////////////////////////////////////////////
1268 
1269 static const SkBlitRow::Proc platform_565_procs[] = {
1270     // no dither
1271     S32_D565_Opaque_PROC,
1272     S32_D565_Blend_PROC,
1273     S32A_D565_Opaque_PROC,
1274     S32A_D565_Blend_PROC,
1275 
1276     // dither
1277     S32_D565_Opaque_Dither_PROC,
1278     S32_D565_Blend_Dither_PROC,
1279     S32A_D565_Opaque_Dither_PROC,
1280     NULL,   // S32A_D565_Blend_Dither
1281 };
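/* note: this table is indexed directly by the SkBlitRow flag bits
 * (global alpha, per-pixel alpha, dither), which is why the four
 * non-dither entries come first; a NULL entry should make the generic
 * SkBlitRow code fall back to its portable implementation. */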
1282 
1283 static const SkBlitRow::Proc platform_4444_procs[] = {
1284     // no dither
1285     NULL,   // S32_D4444_Opaque,
1286     NULL,   // S32_D4444_Blend,
1287     NULL,   // S32A_D4444_Opaque,
1288     NULL,   // S32A_D4444_Blend,
1289 
1290     // dither
1291     NULL,   // S32_D4444_Opaque_Dither,
1292     NULL,   // S32_D4444_Blend_Dither,
1293     NULL,   // S32A_D4444_Opaque_Dither,
1294     NULL,   // S32A_D4444_Blend_Dither
1295 };
1296 
1297 static const SkBlitRow::Proc32 platform_32_procs[] = {
1298     NULL,   // S32_Opaque,
1299     S32_Blend_BlitRow32_PROC,		// S32_Blend,
1300     S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
1301     S32A_Blend_BlitRow32_PROC		// S32A_Blend
1302 };
1303 
1304 SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
1305     return platform_4444_procs[flags];
1306 }
1307 
1308 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
1309     return platform_565_procs[flags];
1310 }
1311 
1312 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
1313     return platform_32_procs[flags];
1314 }
1315 
1316 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1317     return NULL;
1318 }
1319 
1320 
1321 SkBlitMask::Proc SkBlitMask::PlatformProcs(SkBitmap::Config dstConfig,
1322                                            SkColor color)
1323 {
1324    return NULL;
1325 }
1326