1 /*
2  * Copyright 2009 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 
9 #include "SkBlitRow.h"
10 #include "SkBlitMask.h"
11 #include "SkColorPriv.h"
12 #include "SkDither.h"
13 
14 #if defined(__ARM_HAVE_NEON)
15 #include <arm_neon.h>
16 #endif
17 
18 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
19 static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
20                                   const SkPMColor* SK_RESTRICT src, int count,
21                                   U8CPU alpha, int /*x*/, int /*y*/) {
22     SkASSERT(255 == alpha);
23 
24     if (count >= 8) {
25         uint16_t* SK_RESTRICT keep_dst;
26 
27         asm volatile (
28                       "ands       ip, %[count], #7            \n\t"
29                       "vmov.u8    d31, #1<<7                  \n\t"
30                       "vld1.16    {q12}, [%[dst]]             \n\t"
31                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
32                       "moveq      ip, #8                      \n\t"
33                       "mov        %[keep_dst], %[dst]         \n\t"
34 
35                       "add        %[src], %[src], ip, LSL#2   \n\t"
36                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
37                       "subs       %[count], %[count], ip      \n\t"
38                       "b          9f                          \n\t"
39                       // LOOP
40                       "2:                                         \n\t"
41 
42                       "vld1.16    {q12}, [%[dst]]!            \n\t"
43                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
44                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
45                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
46                       "subs       %[count], %[count], #8      \n\t"
47                       "9:                                         \n\t"
48                       "pld        [%[dst],#32]                \n\t"
49                       // expand 0565 q12 to 8888 {d4-d7}
50                       "vmovn.u16  d4, q12                     \n\t"
51                       "vshr.u16   q11, q12, #5                \n\t"
52                       "vshr.u16   q10, q12, #6+5              \n\t"
53                       "vmovn.u16  d5, q11                     \n\t"
54                       "vmovn.u16  d6, q10                     \n\t"
55                       "vshl.u8    d4, d4, #3                  \n\t"
56                       "vshl.u8    d5, d5, #2                  \n\t"
57                       "vshl.u8    d6, d6, #3                  \n\t"
58 
59                       "vmovl.u8   q14, d31                    \n\t"
60                       "vmovl.u8   q13, d31                    \n\t"
61                       "vmovl.u8   q12, d31                    \n\t"
62 
63                       // duplicate in 4/2/1 & 8pix vsns
64                       "vmvn.8     d30, d3                     \n\t"
65                       "vmlal.u8   q14, d30, d6                \n\t"
66                       "vmlal.u8   q13, d30, d5                \n\t"
67                       "vmlal.u8   q12, d30, d4                \n\t"
68                       "vshr.u16   q8, q14, #5                 \n\t"
69                       "vshr.u16   q9, q13, #6                 \n\t"
70                       "vaddhn.u16 d6, q14, q8                 \n\t"
71                       "vshr.u16   q8, q12, #5                 \n\t"
72                       "vaddhn.u16 d5, q13, q9                 \n\t"
73                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
74                       "vaddhn.u16 d4, q12, q8                 \n\t"
75                       // intentionally don't calculate alpha
76                       // result in d4-d6
77 
78                       "vqadd.u8   d5, d5, d1                  \n\t"
79                       "vqadd.u8   d4, d4, d2                  \n\t"
80 
81                       // pack 8888 {d4-d6} to 0565 q10
82                       "vshll.u8   q10, d6, #8                 \n\t"
83                       "vshll.u8   q3, d5, #8                  \n\t"
84                       "vshll.u8   q2, d4, #8                  \n\t"
85                       "vsri.u16   q10, q3, #5                 \n\t"
86                       "vsri.u16   q10, q2, #11                \n\t"
87 
88                       "bne        2b                          \n\t"
89 
90                       "1:                                         \n\t"
91                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
92                       : [count] "+r" (count)
93                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
94                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
95                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
96                       "d30","d31"
97                       );
98     }
99     else
100     {   // handle count < 8
101         uint16_t* SK_RESTRICT keep_dst;
102 
103         asm volatile (
104                       "vmov.u8    d31, #1<<7                  \n\t"
105                       "mov        %[keep_dst], %[dst]         \n\t"
106 
107                       "tst        %[count], #4                \n\t"
108                       "beq        14f                         \n\t"
109                       "vld1.16    {d25}, [%[dst]]!            \n\t"
110                       "vld1.32    {q1}, [%[src]]!             \n\t"
111 
112                       "14:                                        \n\t"
113                       "tst        %[count], #2                \n\t"
114                       "beq        12f                         \n\t"
115                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
116                       "vld1.32    {d1}, [%[src]]!             \n\t"
117 
118                       "12:                                        \n\t"
119                       "tst        %[count], #1                \n\t"
120                       "beq        11f                         \n\t"
121                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
122                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
123 
124                       "11:                                        \n\t"
125                       // unzips achieve the same as a vld4 operation
126                       "vuzpq.u16  q0, q1                      \n\t"
127                       "vuzp.u8    d0, d1                      \n\t"
128                       "vuzp.u8    d2, d3                      \n\t"
129                       // expand 0565 q12 to 8888 {d4-d7}
130                       "vmovn.u16  d4, q12                     \n\t"
131                       "vshr.u16   q11, q12, #5                \n\t"
132                       "vshr.u16   q10, q12, #6+5              \n\t"
133                       "vmovn.u16  d5, q11                     \n\t"
134                       "vmovn.u16  d6, q10                     \n\t"
135                       "vshl.u8    d4, d4, #3                  \n\t"
136                       "vshl.u8    d5, d5, #2                  \n\t"
137                       "vshl.u8    d6, d6, #3                  \n\t"
138 
139                       "vmovl.u8   q14, d31                    \n\t"
140                       "vmovl.u8   q13, d31                    \n\t"
141                       "vmovl.u8   q12, d31                    \n\t"
142 
143                       // duplicate in 4/2/1 & 8pix vsns
144                       "vmvn.8     d30, d3                     \n\t"
145                       "vmlal.u8   q14, d30, d6                \n\t"
146                       "vmlal.u8   q13, d30, d5                \n\t"
147                       "vmlal.u8   q12, d30, d4                \n\t"
148                       "vshr.u16   q8, q14, #5                 \n\t"
149                       "vshr.u16   q9, q13, #6                 \n\t"
150                       "vaddhn.u16 d6, q14, q8                 \n\t"
151                       "vshr.u16   q8, q12, #5                 \n\t"
152                       "vaddhn.u16 d5, q13, q9                 \n\t"
153                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
154                       "vaddhn.u16 d4, q12, q8                 \n\t"
155                       // intentionally don't calculate alpha
156                       // result in d4-d6
157 
158                       "vqadd.u8   d5, d5, d1                  \n\t"
159                       "vqadd.u8   d4, d4, d2                  \n\t"
160 
161                       // pack 8888 {d4-d6} to 0565 q10
162                       "vshll.u8   q10, d6, #8                 \n\t"
163                       "vshll.u8   q3, d5, #8                  \n\t"
164                       "vshll.u8   q2, d4, #8                  \n\t"
165                       "vsri.u16   q10, q3, #5                 \n\t"
166                       "vsri.u16   q10, q2, #11                \n\t"
167 
168                       // store
169                       "tst        %[count], #4                \n\t"
170                       "beq        24f                         \n\t"
171                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
172 
173                       "24:                                        \n\t"
174                       "tst        %[count], #2                \n\t"
175                       "beq        22f                         \n\t"
176                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
177 
178                       "22:                                        \n\t"
179                       "tst        %[count], #1                \n\t"
180                       "beq        21f                         \n\t"
181                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
182 
183                       "21:                                        \n\t"
184                       : [count] "+r" (count)
185                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
186                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
187                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
188                       "d30","d31"
189                       );
190     }
191 }
192 
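/* Illustrative scalar sketch of what S32A_D565_Opaque_neon computes per pixel
 * (this helper is not used by the blitters in this file and its name is
 * arbitrary): expand the RGB565 destination to 8-bit channels, scale by
 * (255 - src alpha), add the premultiplied source channels and repack to
 * RGB565.  The assembly approximates the divide by 255 with a biased
 * multiply-accumulate plus vaddhn, so its output can differ slightly from
 * this sketch.
 */
static inline uint16_t S32A_D565_Opaque_onePixel_sketch(uint16_t dst, SkPMColor src) {
    unsigned dr = (dst >> 11) << 3;          // dst red,   expanded to 8 bits
    unsigned dg = ((dst >> 5) & 0x3F) << 2;  // dst green, expanded to 8 bits
    unsigned db = (dst & 0x1F) << 3;         // dst blue,  expanded to 8 bits
    unsigned isa = 255 - SkGetPackedA32(src);
    unsigned r = SkGetPackedR32(src) + SkMulDiv255Round(dr, isa);
    unsigned g = SkGetPackedG32(src) + SkMulDiv255Round(dg, isa);
    unsigned b = SkGetPackedB32(src) + SkMulDiv255Round(db, isa);
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);   // repack to 5/6/5
}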
193 static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
194                                  const SkPMColor* SK_RESTRICT src, int count,
195                                  U8CPU alpha, int /*x*/, int /*y*/) {
196 
197     U8CPU alpha_for_asm = alpha;
198 
199     asm volatile (
200     /* This code implements a Neon version of S32A_D565_Blend. The output differs from
201      * the original in two respects:
202      *  1. The results have a few mismatches compared to the original code. These mismatches
203  *     never exceed 1. The accuracy of the final stage is selectable below:
204  *     the "#if 1" branch uses rounding right shifts (vrshr) for a closer
205  *     match to the scalar SkDiv255Round() used in the tail loop, while the
206  *     "#else" branch keeps ARM's original truncating divide by 256 (see the
207  *     comments at that point in the code).
208      *  2. On certain inputs, the original code can overflow, causing colour channels to
209      *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
210      *     to affect another.
211      */
212 
213 #if 1
214 		/* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
215                   "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
216 #else
217                   "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
218 #endif
219                   "vmov.u16   q3, #255                        \n\t"   // set up constant
220                   "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
221                   "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
222                   "beq        2f                              \n\t"   // if count8 == 0, exit
223                   "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
224 
225                   "1:                                             \n\t"
226                   "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
227                   "subs       r4, r4, #1                      \n\t"   // decrement loop counter
228                   "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
229                   //  and deinterleave
230 
231                   "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
232                   "vand       q10, q0, q15                    \n\t"   // extract blue
233                   "vshr.u16   q8, q0, #11                     \n\t"   // extract red
234                   "vshr.u16   q9, q9, #10                     \n\t"   // extract green
235                   // dstrgb = {q8, q9, q10}
236 
237                   "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
238                   "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
239                   "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
240 
241                   "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
242                   "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
243                   "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
244                   "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
245                   // srcrgba = {q11, q12, q13, q14}
246 
247                   "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
248                   "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
249                   "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
250                   "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
251 
252                   "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
253                   "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
254                   // dst_scale = q2
255 
256                   "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
257                   "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
258                   "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
259 
260 #if 1
261 	// trying for a better match with SkDiv255Round(a)
262 	// C alg is:  a+=128; (a+a>>8)>>8
263 	// we'll use just a rounding shift [q2 is available for scratch]
264                   "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
265                   "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
266                   "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
267 #else
268 	// arm's original "truncating divide by 256"
269                   "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
270                   "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
271                   "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
272 #endif
273 
274                   "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
275                   "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
276                   "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
277 
278                   "bne        1b                              \n\t"   // if counter != 0, loop
279                   "2:                                             \n\t"   // exit
280 
281                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
282                   :
283                   : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
284                   );
285 
286     count &= 7;
287     if (count > 0) {
288         do {
289             SkPMColor sc = *src++;
290             if (sc) {
291                 uint16_t dc = *dst;
292                 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
293                 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
294                 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
295                 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
296                 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
297             }
298             dst += 1;
299         } while (--count != 0);
300     }
301 }
302 
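/* For reference, the arithmetic behind the mismatch note above: the NEON loop
 * scales with SkAlpha255To256() semantics (scale = alpha + 1, range 0..256)
 * and divides the 16-bit products by 256 with a rounding shift, roughly
 *
 *     (x + 128) >> 8
 *
 * whereas the scalar tail uses SkDiv255Round(x), an exact rounded divide by
 * 255:
 *
 *     x += 128; result = (x + (x >> 8)) >> 8;
 *
 * The two can differ by one, e.g. x = 383 gives (383 + 128) >> 8 = 1 but
 * SkDiv255Round(383) = 2.
 */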
303 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
304  * each dither value is spaced out into byte lanes, and repeated
305  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
306  * start of each row.
307  */
308 static const uint8_t gDitherMatrix_Neon[48] = {
309     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
310     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
311     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
312     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
313 
314 };
315 
316 static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
317                                        int count, U8CPU alpha, int x, int y)
318 {
319     /* select row and offset for dither array */
320     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
321 
322     /* rescale alpha to range 0 - 256 */
323     int scale = SkAlpha255To256(alpha);
324 
325     asm volatile (
326                   "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
327                   "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
328                   "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
329                   "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
330                   "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
331                   "1:                                                 \n\t"
332                   "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
333                   "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
334                   "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
335                   "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
336                   "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
337                   "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
338                   "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
339                   "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
340                   "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
341                   "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
342                   "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
343                   "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
344                   "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
345                   // load 8 pixels from dst, extract rgb
346                   "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
347                   "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
348                   "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
349                   "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
350                   "vand           d17, d17, d29                   \n\t"   // and green with green mask
351                   "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
352                   "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
353                   // src = {d22 (r), d23 (g), d24 (b)}
354                   // dst = {d16 (r), d17 (g), d18 (b)}
355                   // subtract dst from src and widen
356                   "vsubl.s8       q0, d22, d16                    \n\t"   // subtract dst red from src red
357                   "vsubl.s8       q1, d23, d17                    \n\t"   // subtract dst green from src green
358                   "vsubl.s8       q2, d24, d18                    \n\t"   // subtract dst blue from src blue
359                   // multiply diffs by scale and shift
360                   "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
361                   "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
362                   "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
363                   "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
364                   "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
365                   "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
366                   "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
367                   // add dst to result
368                   "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
369                   "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
370                   "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
371                   // put result into 565 format
372                   "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
373                   "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
374                   "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
375                   "bgt            1b                              \n\t"   // loop if count > 0
376                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
377                   : [dstart] "r" (dstart), [scale] "r" (scale)
378                   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
379                   );
380 
381     DITHER_565_SCAN(y);
382 
383     while((count & 7) > 0)
384     {
385         SkPMColor c = *src++;
386 
387         int dither = DITHER_VALUE(x);
388         int sr = SkGetPackedR32(c);
389         int sg = SkGetPackedG32(c);
390         int sb = SkGetPackedB32(c);
391         sr = SkDITHER_R32To565(sr, dither);
392         sg = SkDITHER_G32To565(sg, dither);
393         sb = SkDITHER_B32To565(sb, dither);
394 
395         uint16_t d = *dst;
396         *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
397                              SkAlphaBlend(sg, SkGetPackedG16(d), scale),
398                              SkAlphaBlend(sb, SkGetPackedB16(d), scale));
399         DITHER_INC_X(x);
400         count--;
401     }
402 }
403 
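/* Illustrative scalar sketch of the per-pixel dither + blend performed by
 * S32_D565_Blend_Dither_neon above (not used by the code in this file; the
 * name is arbitrary).  Each channel adds its dither value, subtracts the
 * channel's own top bits and narrows to 565 range -- the vaddl/vsubw/vshrn
 * sequence -- and is then blended against dst with the 0..256 scale.
 */
static inline uint16_t S32_D565_Blend_Dither_onePixel_sketch(uint16_t dst,
                                                             SkPMColor src,
                                                             int dither,
                                                             int scale) {
    int sr = SkGetPackedR32(src);
    int sg = SkGetPackedG32(src);
    int sb = SkGetPackedB32(src);
    sr = (sr + dither - (sr >> 5)) >> 3;         // red   -> 5 bits
    sg = (sg + (dither >> 1) - (sg >> 6)) >> 2;  // green -> 6 bits (half dither)
    sb = (sb + dither - (sb >> 5)) >> 3;         // blue  -> 5 bits
    return SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(dst), scale),
                       SkAlphaBlend(sg, SkGetPackedG16(dst), scale),
                       SkAlphaBlend(sb, SkGetPackedB16(dst), scale));
}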
404 #define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
405 #define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
406 #define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
407 #elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN)
408 static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
409                                   const SkPMColor* SK_RESTRICT src, int count,
410                                   U8CPU alpha, int /*x*/, int /*y*/) {
411     SkASSERT(255 == alpha);
412 
413     asm volatile (
414                   "1:                                   \n\t"
415                   "ldr     r3, [%[src]], #4             \n\t"
416                   "cmp     r3, #0xff000000              \n\t"
417                   "blo     2f                           \n\t"
418                   "and     r4, r3, #0x0000f8            \n\t"
419                   "and     r5, r3, #0x00fc00            \n\t"
420                   "and     r6, r3, #0xf80000            \n\t"
421                   "pld     [r1, #32]                    \n\t"
422                   "lsl     r3, r4, #8                   \n\t"
423                   "orr     r3, r3, r5, lsr #5           \n\t"
424                   "orr     r3, r3, r6, lsr #19          \n\t"
425                   "subs    %[count], %[count], #1       \n\t"
426                   "strh    r3, [%[dst]], #2             \n\t"
427                   "bne     1b                           \n\t"
428                   "b       4f                           \n\t"
429                   "2:                                   \n\t"
430                   "lsrs    r7, r3, #24                  \n\t"
431                   "beq     3f                           \n\t"
432                   "ldrh    r4, [%[dst]]                 \n\t"
433                   "rsb     r7, r7, #255                 \n\t"
434                   "and     r6, r4, #0x001f              \n\t"
435                   "ubfx    r5, r4, #5, #6               \n\t"
436                   "pld     [r0, #16]                    \n\t"
437                   "lsr     r4, r4, #11                  \n\t"
438                   "smulbb  r6, r6, r7                   \n\t"
439                   "smulbb  r5, r5, r7                   \n\t"
440                   "smulbb  r4, r4, r7                   \n\t"
441                   "ubfx    r7, r3, #16, #8              \n\t"
442                   "ubfx    ip, r3, #8, #8               \n\t"
443                   "and     r3, r3, #0xff                \n\t"
444                   "add     r6, r6, #16                  \n\t"
445                   "add     r5, r5, #32                  \n\t"
446                   "add     r4, r4, #16                  \n\t"
447                   "add     r6, r6, r6, lsr #5           \n\t"
448                   "add     r5, r5, r5, lsr #6           \n\t"
449                   "add     r4, r4, r4, lsr #5           \n\t"
450                   "add     r6, r7, r6, lsr #5           \n\t"
451                   "add     r5, ip, r5, lsr #6           \n\t"
452                   "add     r4, r3, r4, lsr #5           \n\t"
453                   "lsr     r6, r6, #3                   \n\t"
454                   "and     r5, r5, #0xfc                \n\t"
455                   "and     r4, r4, #0xf8                \n\t"
456                   "orr     r6, r6, r5, lsl #3           \n\t"
457                   "orr     r4, r6, r4, lsl #8           \n\t"
458                   "strh    r4, [%[dst]], #2             \n\t"
459                   "pld     [r1, #32]                    \n\t"
460                   "subs    %[count], %[count], #1       \n\t"
461                   "bne     1b                           \n\t"
462                   "b       4f                           \n\t"
463                   "3:                                   \n\t"
464                   "subs    %[count], %[count], #1       \n\t"
465                   "add     %[dst], %[dst], #2           \n\t"
466                   "bne     1b                           \n\t"
467                   "4:                                   \n\t"
468                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
469                   :
470                   : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
471                   );
472 }
473 #define S32A_D565_Opaque_PROC       S32A_D565_Opaque_v7
474 #define S32A_D565_Blend_PROC        NULL
475 #define S32_D565_Blend_Dither_PROC  NULL
476 #else
477 #define S32A_D565_Opaque_PROC       NULL
478 #define S32A_D565_Blend_PROC        NULL
479 #define S32_D565_Blend_Dither_PROC  NULL
480 #endif
481 
482 /* Don't have a special version that assumes each src is opaque, but our S32A
483     is still faster than the default, so use it here
484  */
485 #define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
486 #define S32_D565_Blend_PROC     S32A_D565_Blend_PROC
487 
488 ///////////////////////////////////////////////////////////////////////////////
489 
490 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)
491 
492 static void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
493                                   const SkPMColor* SK_RESTRICT src,
494                                   int count, U8CPU alpha) {
495 	SkASSERT(255 == alpha);
496 	if (count <= 0)
497 	return;
498 
499 	/* Use these to check if src is transparent or opaque */
500 	const unsigned int ALPHA_OPAQ  = 0xFF000000;
501 	const unsigned int ALPHA_TRANS = 0x00FFFFFF;
502 
503 #define UNROLL  4
504 	const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
505 	const SkPMColor* SK_RESTRICT src_temp = src;
506 
507 	/* set up the NEON variables */
508 	uint8x8_t alpha_mask;
509 	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
510 	alpha_mask = vld1_u8(alpha_mask_setup);
511 
512 	uint8x8_t src_raw, dst_raw, dst_final;
513 	uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
514 	uint8x8_t dst_cooked;
515 	uint16x8_t dst_wide;
516 	uint8x8_t alpha_narrow;
517 	uint16x8_t alpha_wide;
518 
519 	/* choose the first processing type */
520 	if( src >= src_end)
521 		goto TAIL;
522 	if(*src <= ALPHA_TRANS)
523 		goto ALPHA_0;
524 	if(*src >= ALPHA_OPAQ)
525 		goto ALPHA_255;
526 	/* fall-thru */
527 
528 ALPHA_1_TO_254:
529 	do {
530 
531 		/* get the source */
532 		src_raw = vreinterpret_u8_u32(vld1_u32(src));
533 		src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
534 
535 		/* get and hold the dst too */
536 		dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
537 		dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
538 
539 
540 		/* get the alphas spread out properly */
541 		alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
542 		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
543 		/* we collapsed (255-a)+1 ... */
544 		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
545 
546 		/* spread the dest */
547 		dst_wide = vmovl_u8(dst_raw);
548 
549 		/* alpha mul the dest */
550 		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
551 		dst_cooked = vshrn_n_u16(dst_wide, 8);
552 
553 		/* sum -- ignoring any byte lane overflows */
554 		dst_final = vadd_u8(src_raw, dst_cooked);
555 
556 		alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
557 		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
558 		/* we collapsed (255-a)+1 ... */
559 		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
560 
561 		/* spread the dest */
562 		dst_wide = vmovl_u8(dst_raw_2);
563 
564 		/* alpha mul the dest */
565 		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
566 		dst_cooked = vshrn_n_u16(dst_wide, 8);
567 
568 		/* sum -- ignoring any byte lane overflows */
569 		dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
570 
571 		vst1_u32(dst, vreinterpret_u32_u8(dst_final));
572 		vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
573 
574 		src += UNROLL;
575 		dst += UNROLL;
576 
577 		/* if 2 of the next pixels aren't between 1 and 254
578 		it might make sense to go to the optimized loops */
579 		if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
580 			break;
581 
582 	} while(src < src_end);
583 
584 	if (src >= src_end)
585 		goto TAIL;
586 
587 	if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
588 		goto ALPHA_255;
589 
590 	/*fall-thru*/
591 
592 ALPHA_0:
593 
594 	/*In this state, we know the current alpha is 0 and
595 	 we optimize for the next alpha also being zero. */
596 	src_temp = src;  //so we don't have to increment dst every time
597 	do {
598 		if(*(++src) > ALPHA_TRANS)
599 			break;
600 		if(*(++src) > ALPHA_TRANS)
601 			break;
602 		if(*(++src) > ALPHA_TRANS)
603 			break;
604 		if(*(++src) > ALPHA_TRANS)
605 			break;
606 	} while(src < src_end);
607 
608 	dst += (src - src_temp);
609 
610 	/* no longer alpha 0, so determine where to go next. */
611 	if( src >= src_end)
612 		goto TAIL;
613 	if(*src >= ALPHA_OPAQ)
614 		goto ALPHA_255;
615 	else
616 		goto ALPHA_1_TO_254;
617 
618 ALPHA_255:
619 	while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
620 		dst[0]=src[0];
621 		dst[1]=src[1];
622 		dst[2]=src[2];
623 		dst[3]=src[3];
624 		src+=UNROLL;
625 		dst+=UNROLL;
626 		if(src >= src_end)
627 			goto TAIL;
628 	}
629 
630 	//Handle remainder.
631 	if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
632 		if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
633 			if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
634 		}
635 	}
636 
637 	if( src >= src_end)
638 		goto TAIL;
639 	if(*src <= ALPHA_TRANS)
640 		goto ALPHA_0;
641 	else
642 		goto ALPHA_1_TO_254;
643 
644 TAIL:
645 	/* do any residual iterations */
646 	src_end += UNROLL + 1;  //goto the real end
647 	while(src != src_end) {
648 		if( *src != 0 ) {
649 			if( *src >= ALPHA_OPAQ ) {
650 				*dst = *src;
651 			}
652 			else {
653 				*dst = SkPMSrcOver(*src, *dst);
654 			}
655 		}
656 		src++;
657 		dst++;
658 	}
659 	return;
660 }
661 
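/* In scalar terms, each pair of pixels in the intrinsics above is blended as
 *
 *     scale = 256 - SkGetPackedA32(*src);      // the collapsed (255 - a) + 1
 *     *dst  = *src + ((each byte lane of *dst) * scale >> 8);
 *
 * vtbl1_u8 with the {3,3,3,3,7,7,7,7} mask broadcasts each pixel's alpha byte
 * across its four colour lanes, and vsubw_u8 from 256 forms the scale, which
 * matches the SkAlpha255To256(255 - a) factor inside the scalar
 * SkPMSrcOver() used for the residual pixels.
 */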
662 #define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon_test_alpha
663 
664 #elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
665 
666 static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
667                                   const SkPMColor* SK_RESTRICT src,
668                                   int count, U8CPU alpha) {
669 
670     SkASSERT(255 == alpha);
671     if (count > 0) {
672 
673 
674 	uint8x8_t alpha_mask;
675 
676 	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
677 	alpha_mask = vld1_u8(alpha_mask_setup);
678 
679 	/* do the NEON unrolled code */
680 #define	UNROLL	4
681 	while (count >= UNROLL) {
682 	    uint8x8_t src_raw, dst_raw, dst_final;
683 	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
684 
685 	    /* get the source */
686 	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
687 #if	UNROLL > 2
688 	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
689 #endif
690 
691 	    /* get and hold the dst too */
692 	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
693 #if	UNROLL > 2
694 	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
695 #endif
696 
697 	/* 1st and 2nd bits of the unrolling */
698 	{
699 	    uint8x8_t dst_cooked;
700 	    uint16x8_t dst_wide;
701 	    uint8x8_t alpha_narrow;
702 	    uint16x8_t alpha_wide;
703 
704 	    /* get the alphas spread out properly */
705 	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
706 #if 1
707 	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
708 	    /* we collapsed (255-a)+1 ... */
709 	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
710 #else
711 	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
712 	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
713 #endif
714 
715 	    /* spread the dest */
716 	    dst_wide = vmovl_u8(dst_raw);
717 
718 	    /* alpha mul the dest */
719 	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
720 	    dst_cooked = vshrn_n_u16(dst_wide, 8);
721 
722 	    /* sum -- ignoring any byte lane overflows */
723 	    dst_final = vadd_u8(src_raw, dst_cooked);
724 	}
725 
726 #if	UNROLL > 2
727 	/* the 3rd and 4th bits of our unrolling */
728 	{
729 	    uint8x8_t dst_cooked;
730 	    uint16x8_t dst_wide;
731 	    uint8x8_t alpha_narrow;
732 	    uint16x8_t alpha_wide;
733 
734 	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
735 #if 1
736 	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
737 	    /* we collapsed (255-a)+1 ... */
738 	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
739 #else
740 	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
741 	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
742 #endif
743 
744 	    /* spread the dest */
745 	    dst_wide = vmovl_u8(dst_raw_2);
746 
747 	    /* alpha mul the dest */
748 	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
749 	    dst_cooked = vshrn_n_u16(dst_wide, 8);
750 
751 	    /* sum -- ignoring any byte lane overflows */
752 	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
753 	}
754 #endif
755 
756 	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
757 #if	UNROLL > 2
758 	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
759 #endif
760 
761 	    src += UNROLL;
762 	    dst += UNROLL;
763 	    count -= UNROLL;
764 	}
765 #undef	UNROLL
766 
767 	/* do any residual iterations */
768         while (--count >= 0) {
769 #ifdef TEST_SRC_ALPHA
770             SkPMColor sc = *src;
771             if (sc) {
772                 unsigned srcA = SkGetPackedA32(sc);
773                 SkPMColor result = sc;
774                 if (srcA != 255) {
775                     result = SkPMSrcOver(sc, *dst);
776                 }
777                 *dst = result;
778             }
779 #else
780             *dst = SkPMSrcOver(*src, *dst);
781 #endif
782             src += 1;
783             dst += 1;
784         }
785     }
786 }
787 
788 #define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
789 
790 #elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */
791 
792 #if defined(TEST_SRC_ALPHA)
793 
794 static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha
795                                         (SkPMColor* SK_RESTRICT dst,
796                                          const SkPMColor* SK_RESTRICT src,
797                                          int count, U8CPU alpha) {
798 
799 /* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
800 /* Predicts that the next pixel will have the same alpha type as the current pixel */
801 
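/* In scalar terms, each PROCESSING BLOCK below computes, per pixel,
 *
 *     scale  = 256 - (src >> 24);
 *     result = src + ((((dst & 0x00ff00ff) * scale >> 8) & 0x00ff00ff)
 *                   | ((((dst >> 8) & 0x00ff00ff) * scale) & 0xff00ff00));
 *
 * i.e. the blue/red and alpha/green byte pairs are masked apart with the
 * 0x00ff00ff constant kept in r12, each pair is scaled inside a single 32-bit
 * multiply, and the two halves are recombined -- the same arithmetic as the
 * scalar SkPMSrcOver() used elsewhere in this file, two channels at a time. */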
802 asm volatile (
803 
804     "\tSTMDB  r13!, {r4-r12, r14}        \n" /* saving r4-r12, lr on the stack */
805                                              /* we should not save r0-r3 according to ABI */
806 
807     "\tCMP    r2, #0                     \n" /* if (count == 0) */
808     "\tBEQ    9f                         \n" /* go to EXIT */
809 
810     "\tMOV    r12, #0xff                 \n" /* load the 0xff mask in r12 */
811     "\tORR    r12, r12, r12, LSL #16     \n" /* convert it to 0xff00ff in r12 */
812 
813     "\tMOV    r14, #255                  \n" /* r14 = 255 */
814                                              /* will be used later for left-side comparison */
815 
816     "\tADD    r2, %[src], r2, LSL #2     \n" /* r2 points to last array element which can be used */
817     "\tSUB    r2, r2, #16                \n" /* as a base for 4-way processing algorithm */
818 
819     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer is bigger than */
820     "\tBGT    8f                         \n" /* calculated marker for 4-way -> */
821                                              /* use simple one-by-one processing */
822 
823     /* START OF DISPATCHING BLOCK */
824 
825     "\t0:                                \n"
826 
827     "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
828 
829     "\tLSR    r7, r3, #24                \n" /* if not all src alphas of 4-way block are equal -> */
830     "\tCMP    r7, r4, LSR #24            \n"
831     "\tCMPEQ  r7, r5, LSR #24            \n"
832     "\tCMPEQ  r7, r6, LSR #24            \n"
833     "\tBNE    1f                         \n" /* -> go to general 4-way processing routine */
834 
835     "\tCMP    r14, r7                    \n" /* if all src alphas are equal to 255 */
836     "\tBEQ    3f                         \n" /* go to alpha == 255 optimized routine */
837 
838     "\tCMP    r7,  #0                    \n" /* if all src alphas are equal to 0 */
839     "\tBEQ    6f                         \n" /* go to alpha == 0 optimized routine */
840 
841     /* END OF DISPATCHING BLOCK */
842 
843     /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
844 
845     "\t1:                                \n"
846                                              /* we do not have enough registers to make */
847                                              /* 4-way [dst] loading -> we are using 2 * 2-way */
848 
849     "\tLDM    %[dst], {r7, r8}           \n" /* 1st 2-way loading of dst values to r7-r8 */
850 
851     /* PROCESSING BLOCK 1 */
852     /* r3 = src, r7 = dst */
853 
854     "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source and storing to r11 */
855     "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
856     "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 256 -> r11 = scale */
857     "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
858     "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
859     "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
860     "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
861     "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
862     "\tORR    r7,  r9,  r10              \n" /* br | ag */
863     "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r7) */
864 
865     /* PROCESSING BLOCK 2 */
866     /* r4 = src, r8 = dst */
867 
868     "\tLSR    r11, r4,  #24              \n" /* see PROCESSING BLOCK 1 */
869     "\tAND    r9,  r12, r8               \n"
870     "\tRSB    r11, r11, #256             \n"
871     "\tAND    r10, r12, r8, LSR #8       \n"
872     "\tMUL    r9,  r9,  r11              \n"
873     "\tAND    r9,  r12, r9, LSR #8       \n"
874     "\tMUL    r10, r10, r11              \n"
875     "\tAND    r10, r10, r12, LSL #8      \n"
876     "\tORR    r8,  r9,  r10              \n"
877     "\tADD    r8,  r4,  r8               \n"
878 
879     "\tSTM    %[dst]!, {r7, r8}          \n" /* 1st 2-way storing of processed dst values */
880 
881     "\tLDM    %[dst], {r9, r10}          \n" /* 2nd 2-way loading of dst values to r9-r10 */
882 
883     /* PROCESSING BLOCK 3 */
884     /* r5 = src, r9 = dst */
885 
886     "\tLSR    r11, r5,  #24              \n" /* see PROCESSING BLOCK 1 */
887     "\tAND    r7,  r12, r9               \n"
888     "\tRSB    r11, r11, #256             \n"
889     "\tAND    r8,  r12, r9, LSR #8       \n"
890     "\tMUL    r7,  r7,  r11              \n"
891     "\tAND    r7,  r12, r7, LSR #8       \n"
892     "\tMUL    r8,  r8,  r11              \n"
893     "\tAND    r8,  r8,  r12, LSL #8      \n"
894     "\tORR    r9,  r7,  r8               \n"
895     "\tADD    r9,  r5,  r9               \n"
896 
897     /* PROCESSING BLOCK 4 */
898     /* r6 = src, r10 = dst */
899 
900     "\tLSR    r11, r6,  #24              \n" /* see PROCESSING BLOCK 1 */
901     "\tAND    r7,  r12, r10              \n"
902     "\tRSB    r11, r11, #256             \n"
903     "\tAND    r8,  r12, r10, LSR #8      \n"
904     "\tMUL    r7,  r7,  r11              \n"
905     "\tAND    r7,  r12, r7, LSR #8       \n"
906     "\tMUL    r8,  r8,  r11              \n"
907     "\tAND    r8,  r8,  r12, LSL #8      \n"
908     "\tORR    r10, r7,  r8               \n"
909     "\tADD    r10, r6,  r10              \n"
910 
911     "\tSTM    %[dst]!, {r9, r10}         \n" /* 2nd 2-way storing of processed dst values */
912 
913     "\tCMP    %[src], r2                 \n" /* if our current [src] pointer <= calculated marker */
914     "\tBLE    0b                         \n" /* we could run 4-way processing -> go to dispatcher */
915     "\tBGT    8f                         \n" /* else -> use simple one-by-one processing */
916 
917     /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
918 
919     /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */
920 
921     "\t2:                                \n" /* ENTRY 1: LOADING [src] to registers */
922 
923     "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
924 
925     "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
926     "\tAND    r8, r5, r6                 \n"
927     "\tAND    r9, r7, r8                 \n"
928     "\tCMP    r14, r9, LSR #24           \n"
929     "\tBNE    4f                         \n" /* -> go to alpha == 0 check */
930 
931     "\t3:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
932 
933     "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
934 
935     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
936     "\tBLE    2b                         \n" /* we could run 4-way processing */
937                                              /* because now we're in ALPHA == 255 state */
938                                              /* run next cycle with priority alpha == 255 checks */
939 
940     "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
941                                              /* use simple one-by-one processing */
942 
943     "\t4:                                \n"
944 
945     "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
946     "\tORR    r8, r5, r6                 \n"
947     "\tORR    r9, r7, r8                 \n"
948     "\tLSRS   r9, #24                    \n"
949     "\tBNE    1b                         \n" /* -> go to general processing mode */
950                                              /* (we already checked for alpha == 255) */
951 
952     "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
953 
954     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
955     "\tBLE    5f                         \n" /* we could run 4-way processing one more time */
956                                              /* because now we're in ALPHA == 0 state */
957                                              /* run next cycle with priority alpha == 0 checks */
958 
959     "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
960                                              /* use simple one-by-one processing */
961 
962     /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */
963 
964     /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */
965 
966     "\t5:                                \n" /* ENTRY 1: LOADING [src] to registers */
967 
968     "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
969 
970     "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
971     "\tORR    r8, r5, r6                 \n"
972     "\tORR    r9, r7, r8                 \n"
973     "\tLSRS   r9, #24                    \n"
974     "\tBNE    7f                         \n" /* -> go to alpha == 255 check */
975 
976     "\t6:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
977 
978     "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
979 
980     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
981     "\tBLE    5b                         \n" /* we could run 4-way processing one more time */
982                                              /* because now we're in ALPHA == 0 state */
983                                              /* run next cycle with priority alpha == 0 checks */
984 
985     "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
986                                              /* use simple one-by-one processing */
987     "\t7:                                \n"
988 
989     "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
990     "\tAND    r8, r5, r6                 \n"
991     "\tAND    r9, r7, r8                 \n"
992     "\tCMP    r14, r9, LSR #24           \n"
993     "\tBNE    1b                         \n" /* -> go to general processing mode */
994                                              /* (we already checked for alpha == 0) */
995 
996     "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
997 
998     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
999     "\tBLE    2b                         \n" /* we could run 4-way processing one more time */
1000                                              /* because now we're in ALPHA == 255 state */
1001                                              /* run next cycle with priority alpha == 255 checks */
1002 
1003     "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
1004                                              /* use simple one-by-one processing */
1005 
1006     /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */
1007 
1008     /* START OF TAIL BLOCK */
1009     /* (used when array is too small to be processed with 4-way algorithm)*/
1010 
1011     "\t8:                                \n"
1012 
1013     "\tADD    r2, r2, #16                \n" /* now r2 points to the element just after array */
1014                                              /* we've done r2 = r2 - 16 at procedure start */
1015 
1016     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
1017     "\tBEQ    9f                         \n" /* goto EXIT */
1018 
1019     /* TAIL PROCESSING BLOCK 1 */
1020 
1021     "\tLDR    r3, [%[src]], #4           \n" /* r3 = *src, src++ */
1022     "\tLDR    r7, [%[dst]]               \n" /* r7 = *dst */
1023 
1024     "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source */
1025     "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
1026     "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 256 -> r11 = scale */
1027     "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
1028     "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
1029     "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
1030     "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
1031     "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
1032     "\tORR    r7,  r9,  r10              \n" /* br | ag */
1033     "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r7) */
1034 
1035     "\tSTR    r7, [%[dst]], #4           \n" /* *dst = r7; dst++ */
1036 
1037     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
1038     "\tBEQ    9f                         \n" /* goto EXIT */
1039 
1040     /* TAIL PROCESSING BLOCK 2 */
1041 
1042     "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
1043     "\tLDR    r7, [%[dst]]               \n"
1044 
1045     "\tLSR    r11, r3,  #24              \n"
1046     "\tAND    r9,  r12, r7               \n"
1047     "\tRSB    r11, r11, #256             \n"
1048     "\tAND    r10, r12, r7, LSR #8       \n"
1049     "\tMUL    r9,  r9,  r11              \n"
1050     "\tAND    r9,  r12, r9, LSR #8       \n"
1051     "\tMUL    r10, r10, r11              \n"
1052     "\tAND    r10, r10, r12, LSL #8      \n"
1053     "\tORR    r7,  r9,  r10              \n"
1054     "\tADD    r7,  r3,  r7               \n"
1055 
1056     "\tSTR    r7, [%[dst]], #4           \n"
1057 
1058     "\tCMP    %[src], r2                 \n"
1059     "\tBEQ    9f                         \n"
1060 
1061     /* TAIL PROCESSING BLOCK 3 */
1062 
1063     "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
1064     "\tLDR    r7, [%[dst]]               \n"
1065 
1066     "\tLSR    r11, r3,  #24              \n"
1067     "\tAND    r9,  r12, r7               \n"
1068     "\tRSB    r11, r11, #256             \n"
1069     "\tAND    r10, r12, r7, LSR #8       \n"
1070     "\tMUL    r9,  r9,  r11              \n"
1071     "\tAND    r9,  r12, r9, LSR #8       \n"
1072     "\tMUL    r10, r10, r11              \n"
1073     "\tAND    r10, r10, r12, LSL #8      \n"
1074     "\tORR    r7,  r9,  r10              \n"
1075     "\tADD    r7,  r3,  r7               \n"
1076 
1077     "\tSTR    r7, [%[dst]], #4           \n"
1078 
1079     /* END OF TAIL BLOCK */
1080 
1081     "\t9:                                \n" /* EXIT */
1082 
1083     "\tLDMIA  r13!, {r4-r12, r14}        \n" /* restoring r4-r12, lr from stack */
1084     "\tBX     lr                         \n" /* return */
1085 
1086     : [dst] "+r" (dst), [src] "+r" (src)
1087     :
1088     : "cc", "r2", "r3", "memory"
1089 
1090     );
1091 
1092 }
1093 
1094 #define	S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm_test_alpha
1095 #else /* !defined(TEST_SRC_ALPHA) */
1096 
1097 static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
1098                                   const SkPMColor* SK_RESTRICT src,
1099                                   int count, U8CPU alpha) {
1100 
1101     SkASSERT(255 == alpha);
1102 
1103     /* Does not support the TEST_SRC_ALPHA case */
1104     asm volatile (
1105                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
1106                   "beq    3f                         \n\t" /* if zero exit */
1107 
1108                   "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
1109                   "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */
1110 
1111                   "cmp    %[count], #2               \n\t" /* compare count with 2 */
1112                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
1113 
1114                   /* Double Loop */
1115                   "1:                                \n\t" /* <double loop> */
1116                   "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
1117                   "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
1118                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
1119 
1120                   /* ----------- */
1121                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
1122                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
1123                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
1124 
1125                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
1126                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
1127                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
1128 
1129                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
1130                   "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
1131                   "orr    r7, r9, r10                \n\t" /* br | ag*/
1132 
1133                   "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
1134                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
1135 
1136                   /* ----------- */
1137                   "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */
1138 
1139                   "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
1140                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
1141                   "sub    %[count], %[count], #2     \n\t"
1142                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
1143 
1144                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
1145                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
1146                   "cmp    %[count], #1               \n\t" /* comparing count with 1 */
1147                   "orr    r8, r9, r10                \n\t" /* br | ag */
1148 
1149                   "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */
1150 
1151                   /* ----------------- */
1152                   "stm    %[dst]!, {r7,r8}           \n\t" /* store r7,r8 to dst, increment dst by two pixels */
1153                   /* ----------------- */
1154 
1155                   "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
1156                   "blt    3f                         \n\t" /* if less than 1 -> exit */
1157 
1158                   /* Single Loop */
1159                   "2:                                \n\t" /* <single loop> */
1160                   "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, post-increment src */
1161                   "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
1162                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
1163 
1164                   /* ----------- */
1165                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
1166                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
1167 
1168                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
1169                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
1170                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
1171                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
1172 
1173                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
1174                   "orr    r7, r9, r10                \n\t" /* br | ag */
1175 
1176                   "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */
1177 
1178                   /* ----------------- */
1179                   "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
1180                   /* ----------------- */
1181 
1182                   "3:                                \n\t" /* <exit> */
1183                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
1184                   :
1185                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
1186                   );
1187 }
1188 #define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_arm
1189 #endif /* !defined(TEST_SRC_ALPHA) */
1190 #else /* ... #elif defined (__ARM_ARCH__) */
1191 #define	S32A_Opaque_BlitRow32_PROC	NULL
1192 #endif
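
/* For reference, a minimal scalar sketch of the per-pixel math the
 * S32A_Opaque asm versions above implement, written with the SkColorPriv.h
 * helpers this file already includes. The function name is illustrative only
 * and nothing below is wired into the dispatch tables.
 */
#if 0   /* sketch only -- not compiled */
static void S32A_Opaque_BlitRow32_scalar_sketch(SkPMColor* SK_RESTRICT dst,
                                                const SkPMColor* SK_RESTRICT src,
                                                int count) {
    for (int i = 0; i < count; i++) {
        SkPMColor s = src[i];
        unsigned scale = 256 - SkGetPackedA32(s);   /* the "rsb rN, rN, #256" */
        /* SkAlphaMulQ scales the 0x00FF00FF-masked rb and ag halves separately,
         * exactly like the mul/and/orr sequences in the asm above. */
        dst[i] = s + SkAlphaMulQ(dst[i], scale);
    }
}
#endif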
1193 
1194 /*
1195  * ARM asm version of S32A_Blend_BlitRow32
1196  */
1197 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
1198                                  const SkPMColor* SK_RESTRICT src,
1199                                  int count, U8CPU alpha) {
1200     asm volatile (
1201                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
1202                   "beq    3f                         \n\t" /* if zero exit */
1203 
1204                   "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
1205                   "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
1206 
1207                   /* src1,2_scale */
1208                   "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
1209 
1210                   "cmp    %[count], #2               \n\t" /* comparing count with 2 */
1211                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
1212 
1213                   /* Double Loop */
1214                   "1:                                \n\t" /* <double loop> */
1215                   "ldm    %[src]!, {r5, r6}          \n\t" /* loading src pointers into r5 and r6 */
1216                   "ldm    %[dst], {r7, r8}           \n\t" /* loading dst pointers into r7 and r8 */
1217 
1218                   /* dst1_scale and dst2_scale*/
1219                   "lsr    r9, r5, #24                \n\t" /* src >> 24 */
1220                   "lsr    r10, r6, #24               \n\t" /* src >> 24 */
1221                   "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
1222                   "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
1223                   "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
1224                   "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
1225                   "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
1226                   "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
1227 
1228                   /* ---------------------- */
1229 
1230                   /* src1, src1_scale */
1231                   "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
1232                   "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
1233                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
1234                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
1235                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
1236                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
1237                   "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
1238 
1239                   /* dst1, dst1_scale */
1240                   "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
1241                   "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
1242                   "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
1243                   "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
1244                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
1245                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
1246                   "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
1247 
1248                   /* ---------------------- */
1249                   "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
1250                   /* ---------------------- */
1251 
1252                   /* ====================== */
1253 
1254                   /* src2, src2_scale */
1255                   "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
1256                   "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
1257                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
1258                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
1259                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
1260                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
1261                   "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
1262 
1263                   /* dst2, dst2_scale */
1264                   "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
1265                   "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
1266                   "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
1267                   "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
1268                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
1269                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
1270                   "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
1271 
1272                   "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
1273                   /* ---------------------- */
1274                   "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
1275                   /* ---------------------- */
1276                   "cmp    %[count], #1               \n\t" /* compare count with 1 */
1277                   /* ----------------- */
1278                   "stm    %[dst]!, {r9, r10}         \n\t" /* store r9,r10 to dst, increment dst by two pixels */
1279                   /* ----------------- */
1280 
1281                   "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
1282                   "blt    3f                         \n\t" /* if %[count] less than 1 exit */
1283                                                            /* else get into the single loop */
1284                   /* Single Loop */
1285                   "2:                                \n\t" /* <single loop> */
1286                   "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, post-increment src */
1287                   "ldr    r7, [%[dst]]               \n\t" /* load the dst pixel into r7 */
1288 
1289                   "lsr    r6, r5, #24                \n\t" /* src >> 24 */
1290                   "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
1291                   "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
1292                   "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
1293                   "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
1294                   "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
1295                   "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
1296 
1297                   /* src, src_scale */
1298                   "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
1299                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
1300                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
1301                   "orr    r10, r8, r9                \n\t" /* r10 = (src, src_scale) */
1302 
1303                   /* dst, dst_scale */
1304                   "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
1305                   "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
1306                   "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
1307                   "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
1308                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
1309                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
1310                   "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
1311 
1312                   "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
1313 
1314                   /* ----------------- */
1315                   "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
1316                   /* ----------------- */
1317 
1318                   "3:                                \n\t" /* <exit> */
1319                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
1320                   :
1321                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
1322                   );
1323 
1324 }
1325 #define	S32A_Blend_BlitRow32_PROC	S32A_Blend_BlitRow32_arm
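
/* A minimal scalar sketch of the blend math above (the helper calls come
 * from SkColorPriv.h; the function name is illustrative and unused). It
 * mirrors the asm: src_scale = alpha + 1, and per pixel
 * dst_scale = 256 - ((src_alpha * src_scale) >> 8), i.e. the
 * smulbb / lsr #8 / rsb #256 sequence.
 */
#if 0   /* sketch only -- not compiled */
static void S32A_Blend_BlitRow32_scalar_sketch(SkPMColor* SK_RESTRICT dst,
                                               const SkPMColor* SK_RESTRICT src,
                                               int count, U8CPU alpha) {
    unsigned src_scale = alpha + 1;                 /* "add %[alpha], %[alpha], #1" */
    for (int i = 0; i < count; i++) {
        SkPMColor s = src[i];
        unsigned dst_scale = 256 - ((SkGetPackedA32(s) * src_scale) >> 8);
        dst[i] = SkAlphaMulQ(s, src_scale) + SkAlphaMulQ(dst[i], dst_scale);
    }
}
#endif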
1326 
1327 /* Neon version of S32_Blend_BlitRow32()
1328  * portable version is in src/core/SkBlitRow_D32.cpp
1329  */
1330 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1331 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1332                                 const SkPMColor* SK_RESTRICT src,
1333                                 int count, U8CPU alpha) {
1334     SkASSERT(alpha <= 255);
1335     if (count > 0) {
1336         uint16_t src_scale = SkAlpha255To256(alpha);
1337         uint16_t dst_scale = 256 - src_scale;
1338 
1339 	/* run them N at a time through the NEON unit */
1340 	/* note that each pixel is 4 bytes and every byte is treated the
1341 	 * same, so we can work under that guise. We *do* know that src and
1342 	 * dst are 32-bit aligned quantities, so we can specify that on the
1343 	 * load/store ops and do a NEON 'reinterpret' to get byte-sized
1344 	 * (pun intended) pieces that we widen/multiply/shift. We're limited
1345 	 * to 128 bits in the wide ops, which is 8x16 bits, i.e. a pair of
1346 	 * 32-bit src/dst pixels per pass.
1347 	 */
1348 	/* we *could* manually unroll this loop so that we load 128 bits
1349 	 * (as a pair of 64s) from each of src and dst, processing them
1350 	 * in pieces. This might give us a little better management of
1351 	 * the memory latency, but my initial attempts here did not
1352 	 * produce an instruction stream that looked all that nice.
1353 	 */
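	/* A worked per-element example (values chosen for illustration):
	 * with alpha = 0x80, src_scale = 129 and dst_scale = 127, so each
	 * 8-bit channel becomes (src_c*129 + dst_c*127) >> 8 -- which is
	 * what the widen / vmul / vmla / vshrn sequence below computes for
	 * eight 16-bit lanes (two pixels) at a time.
	 */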
1354 #define	UNROLL	2
1355 	while (count >= UNROLL) {
1356 	    uint8x8_t  src_raw, dst_raw, dst_final;
1357 	    uint16x8_t  src_wide, dst_wide;
1358 
1359 	    /* get 64 bits of src, widen it, multiply by src_scale */
1360 	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
1361 	    src_wide = vmovl_u8(src_raw);
1362 	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
1363 	    src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
1364 
1365 	    /* ditto with dst */
1366 	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
1367 	    dst_wide = vmovl_u8(dst_raw);
1368 
1369 	    /* combine add with dst multiply into mul-accumulate */
1370 	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
1371 
1372 	    dst_final = vshrn_n_u16(dst_wide, 8);
1373 	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
1374 
1375 	    src += UNROLL;
1376 	    dst += UNROLL;
1377 	    count -= UNROLL;
1378 	}
1379 	/* RBE: I don't like how gcc manages src/dst across the above loop:
1380 	 * it keeps recalculating src+bias and dst+bias inside the loop and
1381 	 * only adjusts the real pointers when we leave it. Not sure why it
1382 	 * "hoists down" (hoisting implies above, in my lexicon ;)) the
1383 	 * adjustments to src/dst/count, but it does...
1384 	 * (might be SSA-style internal logic.)
1385 	 */
1386 
1387 #if	UNROLL == 2
1388 	if (count == 1) {
1389             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1390 	}
1391 #else
1392 	if (count > 0) {
1393             do {
1394                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1395                 src += 1;
1396                 dst += 1;
1397             } while (--count > 0);
1398 	}
1399 #endif
1400 
1401 #undef	UNROLL
1402     }
1403 }
1404 
1405 #define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
1406 #else
1407 #define	S32_Blend_BlitRow32_PROC	NULL
1408 #endif
1409 
1410 ///////////////////////////////////////////////////////////////////////////////
1411 
1412 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1413 
1414 #undef	DEBUG_OPAQUE_DITHER
1415 
1416 #if	defined(DEBUG_OPAQUE_DITHER)
1417 static void showme8(char *str, void *p, int len)
1418 {
1419 	static char buf[256];
1420 	char tbuf[32];
1421 	int i;
1422 	char *pc = (char*) p;
1423 	sprintf(buf,"%8s:", str);
1424 	for(i=0;i<len;i++) {
1425 	    sprintf(tbuf, "   %02x", pc[i]);
1426 	    strcat(buf, tbuf);
1427 	}
1428 	SkDebugf("%s\n", buf);
1429 }
1430 static void showme16(char *str, void *p, int len)
1431 {
1432 	static char buf[256];
1433 	char tbuf[32];
1434 	int i;
1435 	uint16_t *pc = (uint16_t*) p;
1436 	sprintf(buf,"%8s:", str);
1437 	len = (len / sizeof(uint16_t));	/* passed as bytes */
1438 	for(i=0;i<len;i++) {
1439 	    sprintf(tbuf, " %04x", pc[i]);
1440 	    strcat(buf, tbuf);
1441 	}
1442 	SkDebugf("%s\n", buf);
1443 }
1444 #endif
1445 
1446 static void S32A_D565_Opaque_Dither_neon(uint16_t * SK_RESTRICT dst,
1447                                       const SkPMColor* SK_RESTRICT src,
1448                                       int count, U8CPU alpha, int x, int y) {
1449     SkASSERT(255 == alpha);
1450 
1451 #define	UNROLL	8
1452 
1453     if (count >= UNROLL) {
1454 	uint8x8_t dbase;
1455 
1456 #if	defined(DEBUG_OPAQUE_DITHER)
1457 	uint16_t tmpbuf[UNROLL];
1458 	int td[UNROLL];
1459 	int tdv[UNROLL];
1460 	int ta[UNROLL];
1461 	int tap[UNROLL];
1462 	uint16_t in_dst[UNROLL];
1463 	int offset = 0;
1464 	int noisy = 0;
1465 #endif
1466 
1467 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1468 	dbase = vld1_u8(dstart);
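	/* note: (y&3)*12 + (x&3) implies each dither row is stored 12 bytes
	 * wide (the 4 matrix values repeated), so loading 8 consecutive
	 * bytes starting at any x&3 offset yields a correctly wrapped
	 * dither sequence for 8 pixels. */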
1469 
1470         do {
1471 	    uint8x8_t sr, sg, sb, sa, d;
1472 	    uint16x8_t dst8, scale8, alpha8;
1473 	    uint16x8_t dst_r, dst_g, dst_b;
1474 
1475 #if	defined(DEBUG_OPAQUE_DITHER)
1476 	/* calculate 8 elements worth into a temp buffer */
1477 	{
1478 	  int my_y = y;
1479 	  int my_x = x;
1480 	  SkPMColor* my_src = (SkPMColor*)src;
1481 	  uint16_t* my_dst = dst;
1482 	  int i;
1483 
1484           DITHER_565_SCAN(my_y);
1485           for(i=0;i<UNROLL;i++) {
1486             SkPMColor c = *my_src++;
1487             SkPMColorAssert(c);
1488             if (c) {
1489                 unsigned a = SkGetPackedA32(c);
1490 
1491                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1492 		tdv[i] = DITHER_VALUE(my_x);
1493 		ta[i] = a;
1494 		tap[i] = SkAlpha255To256(a);
1495 		td[i] = d;
1496 
1497                 unsigned sr = SkGetPackedR32(c);
1498                 unsigned sg = SkGetPackedG32(c);
1499                 unsigned sb = SkGetPackedB32(c);
1500                 sr = SkDITHER_R32_FOR_565(sr, d);
1501                 sg = SkDITHER_G32_FOR_565(sg, d);
1502                 sb = SkDITHER_B32_FOR_565(sb, d);
1503 
1504                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1505                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1506                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1507                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1508                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1509 		td[i] = d;
1510 
1511             } else {
1512 		tmpbuf[i] = *my_dst;
1513 		ta[i] = tdv[i] = td[i] = 0xbeef;
1514 	    }
1515 	    in_dst[i] = *my_dst;
1516             my_dst += 1;
1517             DITHER_INC_X(my_x);
1518           }
1519 	}
1520 #endif
1521 
1522 	    /* source is in ABGR */
1523 	    {
1524 		register uint8x8_t d0 asm("d0");
1525 		register uint8x8_t d1 asm("d1");
1526 		register uint8x8_t d2 asm("d2");
1527 		register uint8x8_t d3 asm("d3");
1528 
1529 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1530 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1531 		    : "r" (src)
1532                     );
1533 		    sr = d0; sg = d1; sb = d2; sa = d3;
1534 	    }
1535 
1536 	    /* calculate 'd', which will be 0..7 */
1537 	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1538 #if defined(SK_BUILD_FOR_ANDROID)
1539 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1540 	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1541 #else
1542 	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1543 #endif
1544 	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1545 	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
1546 
1547 	    /* sr = sr - (sr>>5) + d */
1548 	    /* watching for 8-bit overflow.  d is 0..7; risky range of
1549 	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1550 	     * safe  as long as we do ((sr-sr>>5) + d) */
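	    /* worked bound (illustrative): sr = 255 gives 255 - (255>>5) =
	     * 248, and d is at most (7*256)>>8 = 7, so 248 + 7 = 255 still
	     * fits in 8 bits. */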
1551 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1552 	    sr = vadd_u8(sr, d);
1553 
1554 	    /* sb = sb - (sb>>5) + d */
1555 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1556 	    sb = vadd_u8(sb, d);
1557 
1558 	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1559 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1560 	    sg = vadd_u8(sg, vshr_n_u8(d,1));
1561 
1562 	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1563 	    dst8 = vld1q_u16(dst);
1564 	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1565 	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1566 	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
1567 
1568 	    /* blend */
1569 #if 1
1570 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1571 	    /* originally 255-sa + 1 */
1572 	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1573 #else
1574 	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1575 	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1576 #endif
1577 
1578 #if 1
1579 	    /* combine the addq and mul, save 3 insns */
1580 	    scale8 = vshrq_n_u16(scale8, 3);
1581 	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1582 	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1583 	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1584 #else
1585 	    /* known correct, but +3 insns over above */
1586 	    scale8 = vshrq_n_u16(scale8, 3);
1587 	    dst_b = vmulq_u16(dst_b, scale8);
1588 	    dst_g = vmulq_u16(dst_g, scale8);
1589 	    dst_r = vmulq_u16(dst_r, scale8);
1590 
1591 	    /* combine */
1592 	    /* NB: vshll widens, need to preserve those bits */
1593 	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1594 	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1595 	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1596 #endif
1597 
1598 	    /* repack to store */
1599 	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1600 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1601 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1602 
1603 	    vst1q_u16(dst, dst8);
1604 
1605 #if	defined(DEBUG_OPAQUE_DITHER)
1606 	    /* verify my 8 elements match the temp buffer */
1607 	{
1608 	   int i, bad=0;
1609 	   static int invocation;
1610 
1611 	   for (i=0;i<UNROLL;i++)
1612 		if (tmpbuf[i] != dst[i]) bad=1;
1613 	   if (bad) {
1614 		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1615 			invocation, offset);
1616 		SkDebugf("  alpha 0x%x\n", alpha);
1617 		for (i=0;i<UNROLL;i++)
1618 		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1619 			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1620 			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1621 
1622 		showme16("alpha8", &alpha8, sizeof(alpha8));
1623 		showme16("scale8", &scale8, sizeof(scale8));
1624 		showme8("d", &d, sizeof(d));
1625 		showme16("dst8", &dst8, sizeof(dst8));
1626 		showme16("dst_b", &dst_b, sizeof(dst_b));
1627 		showme16("dst_g", &dst_g, sizeof(dst_g));
1628 		showme16("dst_r", &dst_r, sizeof(dst_r));
1629 		showme8("sb", &sb, sizeof(sb));
1630 		showme8("sg", &sg, sizeof(sg));
1631 		showme8("sr", &sr, sizeof(sr));
1632 
1633 		/* cop out */
1634 		return;
1635 	   }
1636 	   offset += UNROLL;
1637 	   invocation++;
1638 	}
1639 #endif
1640 
1641             dst += UNROLL;
1642 	    src += UNROLL;
1643 	    count -= UNROLL;
1644 	    /* skip x += UNROLL, since it's unchanged mod-4 */
1645         } while (count >= UNROLL);
1646     }
1647 #undef	UNROLL
1648 
1649     /* residuals */
1650     if (count > 0) {
1651         DITHER_565_SCAN(y);
1652         do {
1653             SkPMColor c = *src++;
1654             SkPMColorAssert(c);
1655             if (c) {
1656                 unsigned a = SkGetPackedA32(c);
1657 
1658                 // dither and alpha are just temporary variables to work-around
1659                 // an ICE in debug.
1660                 unsigned dither = DITHER_VALUE(x);
1661                 unsigned alpha = SkAlpha255To256(a);
1662                 int d = SkAlphaMul(dither, alpha);
1663 
1664                 unsigned sr = SkGetPackedR32(c);
1665                 unsigned sg = SkGetPackedG32(c);
1666                 unsigned sb = SkGetPackedB32(c);
1667                 sr = SkDITHER_R32_FOR_565(sr, d);
1668                 sg = SkDITHER_G32_FOR_565(sg, d);
1669                 sb = SkDITHER_B32_FOR_565(sb, d);
1670 
1671                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1672                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1673                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1674                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1675                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1676             }
1677             dst += 1;
1678             DITHER_INC_X(x);
1679         } while (--count != 0);
1680     }
1681 }
1682 
1683 #define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1684 #else
1685 #define	S32A_D565_Opaque_Dither_PROC NULL
1686 #endif
1687 
1688 ///////////////////////////////////////////////////////////////////////////////
1689 
1690 #if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1691 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1692  * speedup untested, but the ARM version is 26 insns/iteration and this
1693  * NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
1694  * roughly 10x fewer instructions per element; that's pure instruction
1695  * counts, not accounting for any instruction or memory latencies.
1696  */
1697 
1698 #undef	DEBUG_S32_OPAQUE_DITHER
1699 
1700 static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1701                                      const SkPMColor* SK_RESTRICT src,
1702                                      int count, U8CPU alpha, int x, int y) {
1703     SkASSERT(255 == alpha);
1704 
1705 #define	UNROLL	8
1706     if (count >= UNROLL) {
1707 	uint8x8_t d;
1708 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1709 	d = vld1_u8(dstart);
1710 
1711 	while (count >= UNROLL) {
1712 	    uint8x8_t sr, sg, sb, sa;
1713 	    uint16x8_t dr, dg, db, da;
1714 	    uint16x8_t dst8;
1715 
1716 	    /* source is in ABGR ordering (R == lsb) */
1717 	    {
1718 		register uint8x8_t d0 asm("d0");
1719 		register uint8x8_t d1 asm("d1");
1720 		register uint8x8_t d2 asm("d2");
1721 		register uint8x8_t d3 asm("d3");
1722 
1723 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1724 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1725 		    : "r" (src)
1726                     );
1727 		    sr = d0; sg = d1; sb = d2; sa = d3;
1728 	    }
1729 	    /* XXX: if we want to prefetch, hide it in the above asm()
1730 	     * using the gcc __builtin_prefetch(), the prefetch will
1731 	     * fall to the bottom of the loop -- it won't stick up
1732 	     * at the top of the loop, just after the vld4.
1733 	     */
1734 
1735 	    /* sr = sr - (sr>>5) + d */
1736 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1737 	    dr = vaddl_u8(sr, d);
1738 
1739 	    /* sb = sb - (sb>>5) + d */
1740 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1741 	    db = vaddl_u8(sb, d);
1742 
1743 	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1744 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1745 	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
1746 	    /* XXX: check that the "d>>1" here is hoisted */
1747 
1748 	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
1749 	    dst8 = vshrq_n_u16(db, 3);
1750 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1751 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
1752 
1753 	    /* store it */
1754 	    vst1q_u16(dst, dst8);
1755 
1756 #if	defined(DEBUG_S32_OPAQUE_DITHER)
1757 	    /* always good to know if we generated good results */
1758 	    {
1759 		int i, myx = x, myy = y;
1760 		DITHER_565_SCAN(myy);
1761 		for (i=0;i<UNROLL;i++) {
1762 		    SkPMColor c = src[i];
1763 		    unsigned dither = DITHER_VALUE(myx);
1764 		    uint16_t val = SkDitherRGB32To565(c, dither);
1765 		    if (val != dst[i]) {
1766 			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1767 			    c, dither, val, dst[i], dstart[i]);
1768 		    }
1769 		    DITHER_INC_X(myx);
1770 		}
1771 	    }
1772 #endif
1773 
1774 	    dst += UNROLL;
1775 	    src += UNROLL;
1776 	    count -= UNROLL;
1777 	    x += UNROLL;		/* probably superfluous */
1778 	}
1779     }
1780 #undef	UNROLL
1781 
1782     /* residuals */
1783     if (count > 0) {
1784         DITHER_565_SCAN(y);
1785         do {
1786             SkPMColor c = *src++;
1787             SkPMColorAssert(c);
1788             SkASSERT(SkGetPackedA32(c) == 255);
1789 
1790             unsigned dither = DITHER_VALUE(x);
1791             *dst++ = SkDitherRGB32To565(c, dither);
1792             DITHER_INC_X(x);
1793         } while (--count != 0);
1794     }
1795 }
1796 
1797 #define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1798 #else
1799 #define	S32_D565_Opaque_Dither_PROC NULL
1800 #endif
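
/* In S32_D565_Opaque_Dither_neon above, an 8-bit channel v with dither d in
 * 0..7 is reduced to 5 bits as (v - (v>>5) + d) >> 3 (green uses
 * v - (v>>6) + (d>>1), then >> 2). Worked example: v = 0x80, d = 3 gives
 * 128 - 4 + 3 = 127 and 127 >> 3 = 15; subtracting v>>5 keeps the sum
 * within 8 bits before the shift.
 */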
1801 
1802 ///////////////////////////////////////////////////////////////////////////////
1803 
1804 static const SkBlitRow::Proc platform_565_procs[] = {
1805     // no dither
1806     S32_D565_Opaque_PROC,
1807     S32_D565_Blend_PROC,
1808     S32A_D565_Opaque_PROC,
1809     S32A_D565_Blend_PROC,
1810 
1811     // dither
1812     S32_D565_Opaque_Dither_PROC,
1813     S32_D565_Blend_Dither_PROC,
1814     S32A_D565_Opaque_Dither_PROC,
1815     NULL,   // S32A_D565_Blend_Dither
1816 };
1817 
1818 static const SkBlitRow::Proc platform_4444_procs[] = {
1819     // no dither
1820     NULL,   // S32_D4444_Opaque,
1821     NULL,   // S32_D4444_Blend,
1822     NULL,   // S32A_D4444_Opaque,
1823     NULL,   // S32A_D4444_Blend,
1824 
1825     // dither
1826     NULL,   // S32_D4444_Opaque_Dither,
1827     NULL,   // S32_D4444_Blend_Dither,
1828     NULL,   // S32A_D4444_Opaque_Dither,
1829     NULL,   // S32A_D4444_Blend_Dither
1830 };
1831 
1832 static const SkBlitRow::Proc32 platform_32_procs[] = {
1833     NULL,   // S32_Opaque,
1834     S32_Blend_BlitRow32_PROC,		// S32_Blend,
1835     S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
1836     S32A_Blend_BlitRow32_PROC		// S32A_Blend
1837 };
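
/* These tables are indexed directly by the SkBlitRow flag bits (per
 * SkBlitRow.h: kGlobalAlpha_Flag = 0x1, kSrcPixelAlpha_Flag = 0x2, and, for
 * the 565 table, kDither_Flag = 0x4). For example, a dithered blit of a
 * per-pixel-alpha source at full global alpha uses index 0x2 | 0x4 = 6,
 * i.e. S32A_D565_Opaque_Dither_PROC. A NULL entry makes the caller fall
 * back to the portable implementation for that combination.
 */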
1838 
1839 SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
1840     return platform_4444_procs[flags];
1841 }
1842 
1843 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
1844     return platform_565_procs[flags];
1845 }
1846 
1847 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
1848     return platform_32_procs[flags];
1849 }
1850 
1851 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1852     return NULL;
1853 }
1854 
1855 ///////////////////////////////////////////////////////////////////////////////
1856 
1857 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
1858                                                      SkMask::Format maskFormat,
1859                                                      SkColor color) {
1860     return NULL;
1861 }
1862 
1863 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
1864     return NULL;
1865 }
1866 
1867 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
1868                                                  SkMask::Format maskFormat,
1869                                                  RowFlags flags) {
1870     return NULL;
1871 }
1872