1 /*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9 #include "SkBlitRow.h"
10 #include "SkBlitMask.h"
11 #include "SkColorPriv.h"
12 #include "SkDither.h"
13
14 #if defined(__ARM_HAVE_NEON)
15 #include <arm_neon.h>
16 #endif
17
18 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
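/* Blit a row of 32-bit premultiplied SkPMColor pixels over an RGB565
 * destination (src-over); the global alpha is asserted to be 255. The
 * prologue of the first asm block peels off the count % 8 leading pixels so
 * the main loop can work in full groups of 8; the else branch handles rows
 * with fewer than 8 pixels.
 */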
19 static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src, int count,
21 U8CPU alpha, int /*x*/, int /*y*/) {
22 SkASSERT(255 == alpha);
23
24 if (count >= 8) {
25 uint16_t* SK_RESTRICT keep_dst;
26
27 asm volatile (
28 "ands ip, %[count], #7 \n\t"
29 "vmov.u8 d31, #1<<7 \n\t"
30 "vld1.16 {q12}, [%[dst]] \n\t"
31 "vld4.8 {d0-d3}, [%[src]] \n\t"
32 "moveq ip, #8 \n\t"
33 "mov %[keep_dst], %[dst] \n\t"
34
35 "add %[src], %[src], ip, LSL#2 \n\t"
36 "add %[dst], %[dst], ip, LSL#1 \n\t"
37 "subs %[count], %[count], ip \n\t"
38 "b 9f \n\t"
39 // LOOP
40 "2: \n\t"
41
42 "vld1.16 {q12}, [%[dst]]! \n\t"
43 "vld4.8 {d0-d3}, [%[src]]! \n\t"
44 "vst1.16 {q10}, [%[keep_dst]] \n\t"
45 "sub %[keep_dst], %[dst], #8*2 \n\t"
46 "subs %[count], %[count], #8 \n\t"
47 "9: \n\t"
48 "pld [%[dst],#32] \n\t"
49 // expand 0565 q12 to 8888 {d4-d7}
50 "vmovn.u16 d4, q12 \n\t"
51 "vshr.u16 q11, q12, #5 \n\t"
52 "vshr.u16 q10, q12, #6+5 \n\t"
53 "vmovn.u16 d5, q11 \n\t"
54 "vmovn.u16 d6, q10 \n\t"
55 "vshl.u8 d4, d4, #3 \n\t"
56 "vshl.u8 d5, d5, #2 \n\t"
57 "vshl.u8 d6, d6, #3 \n\t"
58
59 "vmovl.u8 q14, d31 \n\t"
60 "vmovl.u8 q13, d31 \n\t"
61 "vmovl.u8 q12, d31 \n\t"
62
63 // duplicate in 4/2/1 & 8pix vsns
64 "vmvn.8 d30, d3 \n\t"
65 "vmlal.u8 q14, d30, d6 \n\t"
66 "vmlal.u8 q13, d30, d5 \n\t"
67 "vmlal.u8 q12, d30, d4 \n\t"
68 "vshr.u16 q8, q14, #5 \n\t"
69 "vshr.u16 q9, q13, #6 \n\t"
70 "vaddhn.u16 d6, q14, q8 \n\t"
71 "vshr.u16 q8, q12, #5 \n\t"
72 "vaddhn.u16 d5, q13, q9 \n\t"
73 "vqadd.u8 d6, d6, d0 \n\t" // moved up
74 "vaddhn.u16 d4, q12, q8 \n\t"
75 // intentionally don't calculate alpha
76 // result in d4-d6
77
78 "vqadd.u8 d5, d5, d1 \n\t"
79 "vqadd.u8 d4, d4, d2 \n\t"
80
81 // pack 8888 {d4-d6} to 0565 q10
82 "vshll.u8 q10, d6, #8 \n\t"
83 "vshll.u8 q3, d5, #8 \n\t"
84 "vshll.u8 q2, d4, #8 \n\t"
85 "vsri.u16 q10, q3, #5 \n\t"
86 "vsri.u16 q10, q2, #11 \n\t"
87
88 "bne 2b \n\t"
89
90 "1: \n\t"
91 "vst1.16 {q10}, [%[keep_dst]] \n\t"
92 : [count] "+r" (count)
93 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
94 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
95 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
96 "d30","d31"
97 );
98 }
99 else
100 { // handle count < 8
101 uint16_t* SK_RESTRICT keep_dst;
102
103 asm volatile (
104 "vmov.u8 d31, #1<<7 \n\t"
105 "mov %[keep_dst], %[dst] \n\t"
106
107 "tst %[count], #4 \n\t"
108 "beq 14f \n\t"
109 "vld1.16 {d25}, [%[dst]]! \n\t"
110 "vld1.32 {q1}, [%[src]]! \n\t"
111
112 "14: \n\t"
113 "tst %[count], #2 \n\t"
114 "beq 12f \n\t"
115 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
116 "vld1.32 {d1}, [%[src]]! \n\t"
117
118 "12: \n\t"
119 "tst %[count], #1 \n\t"
120 "beq 11f \n\t"
121 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
122 "vld1.32 {d0[1]}, [%[src]]! \n\t"
123
124 "11: \n\t"
125 // unzips achieve the same as a vld4 operation
126 "vuzpq.u16 q0, q1 \n\t"
127 "vuzp.u8 d0, d1 \n\t"
128 "vuzp.u8 d2, d3 \n\t"
129 // expand 0565 q12 to 8888 {d4-d7}
130 "vmovn.u16 d4, q12 \n\t"
131 "vshr.u16 q11, q12, #5 \n\t"
132 "vshr.u16 q10, q12, #6+5 \n\t"
133 "vmovn.u16 d5, q11 \n\t"
134 "vmovn.u16 d6, q10 \n\t"
135 "vshl.u8 d4, d4, #3 \n\t"
136 "vshl.u8 d5, d5, #2 \n\t"
137 "vshl.u8 d6, d6, #3 \n\t"
138
139 "vmovl.u8 q14, d31 \n\t"
140 "vmovl.u8 q13, d31 \n\t"
141 "vmovl.u8 q12, d31 \n\t"
142
143 // duplicate in 4/2/1 & 8pix vsns
144 "vmvn.8 d30, d3 \n\t"
145 "vmlal.u8 q14, d30, d6 \n\t"
146 "vmlal.u8 q13, d30, d5 \n\t"
147 "vmlal.u8 q12, d30, d4 \n\t"
148 "vshr.u16 q8, q14, #5 \n\t"
149 "vshr.u16 q9, q13, #6 \n\t"
150 "vaddhn.u16 d6, q14, q8 \n\t"
151 "vshr.u16 q8, q12, #5 \n\t"
152 "vaddhn.u16 d5, q13, q9 \n\t"
153 "vqadd.u8 d6, d6, d0 \n\t" // moved up
154 "vaddhn.u16 d4, q12, q8 \n\t"
155 // intentionally don't calculate alpha
156 // result in d4-d6
157
158 "vqadd.u8 d5, d5, d1 \n\t"
159 "vqadd.u8 d4, d4, d2 \n\t"
160
161 // pack 8888 {d4-d6} to 0565 q10
162 "vshll.u8 q10, d6, #8 \n\t"
163 "vshll.u8 q3, d5, #8 \n\t"
164 "vshll.u8 q2, d4, #8 \n\t"
165 "vsri.u16 q10, q3, #5 \n\t"
166 "vsri.u16 q10, q2, #11 \n\t"
167
168 // store
169 "tst %[count], #4 \n\t"
170 "beq 24f \n\t"
171 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
172
173 "24: \n\t"
174 "tst %[count], #2 \n\t"
175 "beq 22f \n\t"
176 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
177
178 "22: \n\t"
179 "tst %[count], #1 \n\t"
180 "beq 21f \n\t"
181 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
182
183 "21: \n\t"
184 : [count] "+r" (count)
185 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
186 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
187 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
188 "d30","d31"
189 );
190 }
191 }
192
193 static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
194 const SkPMColor* SK_RESTRICT src, int count,
195 U8CPU alpha, int /*x*/, int /*y*/) {
196
197 U8CPU alpha_for_asm = alpha;
198
199 asm volatile (
200 /* This code implements a Neon version of S32A_D565_Blend. The output differs from
201 * the original in two respects:
202 * 1. The results have a few mismatches compared to the original code. These mismatches
203 * never exceed 1. It's possible to improve accuracy vs. a floating point
204 * implementation by introducing rounding right shifts (vrshr) for the final stage.
205 * Rounding is not present in the code below, because although results would be closer
206 * to a floating point implementation, the number of mismatches compared to the
207 * original code would be far greater.
208 * 2. On certain inputs, the original code can overflow, causing colour channels to
209 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel
210 * to affect another.
211 */
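        /* Scalar sketch of the per-pixel math the vector code below implements
         * (compare the C loop that follows this asm block):
         *   src_scale = alpha + 1;                                      // 0..256
         *   dst_scale = 255 - ((src_alpha * src_scale) >> 8);
         *   result_ch = (src_ch * src_scale + dst_ch * dst_scale) >> 8; // per 565 channel
         * with the final shift done as a rounding shift (vrshr) in the #if 1 path.
         */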
212
213 #if 1
214 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
215 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256
216 #else
217 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256
218 #endif
219 "vmov.u16 q3, #255 \n\t" // set up constant
220 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3
221 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon
222 "beq 2f \n\t" // if count8 == 0, exit
223 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
224
225 "1: \n\t"
226 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels
227 "subs r4, r4, #1 \n\t" // decrement loop counter
228 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels
229 // and deinterleave
230
231 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes
232 "vand q10, q0, q15 \n\t" // extract blue
233 "vshr.u16 q8, q0, #11 \n\t" // extract red
234 "vshr.u16 q9, q9, #10 \n\t" // extract green
235 // dstrgb = {q8, q9, q10}
236
237 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range
238 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range
239 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range
240
241 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits
242 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits
243 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits
244 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits
245 // srcrgba = {q11, q12, q13, q14}
246
247 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale
248 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale
249 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale
250 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale
251
252 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8
253 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8)
254 // dst_scale = q2
255
256 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale
257 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale
258 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale
259
260 #if 1
261 // trying for a better match with SkDiv255Round(a)
262 // C alg is: a += 128; a = (a + (a >> 8)) >> 8
263 // we'll use just a rounding shift [q2 is available for scratch]
264 "vrshr.u16 q11, q11, #8 \n\t" // shift down red
265 "vrshr.u16 q12, q12, #8 \n\t" // shift down green
266 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
267 #else
268 // arm's original "truncating divide by 256"
269 "vshr.u16 q11, q11, #8 \n\t" // shift down red
270 "vshr.u16 q12, q12, #8 \n\t" // shift down green
271 "vshr.u16 q13, q13, #8 \n\t" // shift down blue
272 #endif
273
274 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
275 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
276 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr
277
278 "bne 1b \n\t" // if counter != 0, loop
279 "2: \n\t" // exit
280
281 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
282 :
283 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
284 );
285
286 count &= 7;
287 if (count > 0) {
288 do {
289 SkPMColor sc = *src++;
290 if (sc) {
291 uint16_t dc = *dst;
292 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
293 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
294 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
295 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
296 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
297 }
298 dst += 1;
299 } while (--count != 0);
300 }
301 }
302
303 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
304 * each dither value is spaced out into byte lanes, and repeated
305 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
306 * start of each row.
307 */
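/* For example, with y == 2 and x == 1 the loads below start at
 * &gDitherMatrix_Neon[(2 & 3)*12 + (1 & 3)], picking up the eight dither
 * values {5, 0, 4, 1, 5, 0, 4, 1} from the third row.
 */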
308 static const uint8_t gDitherMatrix_Neon[48] = {
309 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
310 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
311 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
312 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
313
314 };
315
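/* Blend 32-bit source pixels onto RGB565 with dithering and a global alpha.
 * The NEON loop below works on groups of 8 pixels; the scalar
 * DITHER_565_SCAN loop afterwards picks up the remaining (count & 7) pixels.
 */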
316 static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
317 int count, U8CPU alpha, int x, int y)
318 {
319 /* select row and offset for dither array */
320 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
321
322 /* rescale alpha to range 0 - 256 */
323 int scale = SkAlpha255To256(alpha);
324
325 asm volatile (
326 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values
327 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values
328 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg
329 "vmov.i8 d29, #0x3f \n\t" // set up green mask
330 "vmov.i8 d28, #0x1f \n\t" // set up blue mask
331 "1: \n\t"
332 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb
333 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5
334 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6
335 "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5
336 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen
337 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen
338 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen
339 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result
340 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result
341 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result
342 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits
343 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits
344 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits
345 // load 8 pixels from dst, extract rgb
346 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels
347 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits
348 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes
349 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red
350 "vand d17, d17, d29 \n\t" // and green with green mask
351 "vand d18, d18, d28 \n\t" // and blue with blue mask
352 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes
353 // src = {d22 (r), d23 (g), d24 (b)}
354 // dst = {d16 (r), d17 (g), d18 (b)}
355 // subtract dst from src and widen
356 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst
357 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst
358 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst
359 // multiply diffs by scale and shift
360 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale
361 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale
362 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale
363 "subs %[count], %[count], #8 \n\t" // decrement loop counter
364 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow
365 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow
366 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow
367 // add dst to result
368 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red
369 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green
370 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue
371 // put result into 565 format
372 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue
373 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue
374 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result
375 "bgt 1b \n\t" // loop if count > 0
376 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
377 : [dstart] "r" (dstart), [scale] "r" (scale)
378 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
379 );
380
381 DITHER_565_SCAN(y);
382
383 while((count & 7) > 0)
384 {
385 SkPMColor c = *src++;
386
387 int dither = DITHER_VALUE(x);
388 int sr = SkGetPackedR32(c);
389 int sg = SkGetPackedG32(c);
390 int sb = SkGetPackedB32(c);
391 sr = SkDITHER_R32To565(sr, dither);
392 sg = SkDITHER_G32To565(sg, dither);
393 sb = SkDITHER_B32To565(sb, dither);
394
395 uint16_t d = *dst;
396 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
397 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
398 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
399 DITHER_INC_X(x);
400 count--;
401 }
402 }
403
404 #define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
405 #define S32A_D565_Blend_PROC S32A_D565_Blend_neon
406 #define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
407 #elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN)
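/* Scalar ARMv7 fallback, one pixel per iteration: fully opaque source pixels
 * are converted straight to 565 and stored, fully transparent ones just skip
 * the destination, and everything else takes the blend path.
 */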
408 static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
409 const SkPMColor* SK_RESTRICT src, int count,
410 U8CPU alpha, int /*x*/, int /*y*/) {
411 SkASSERT(255 == alpha);
412
413 asm volatile (
414 "1: \n\t"
415 "ldr r3, [%[src]], #4 \n\t"
416 "cmp r3, #0xff000000 \n\t"
417 "blo 2f \n\t"
418 "and r4, r3, #0x0000f8 \n\t"
419 "and r5, r3, #0x00fc00 \n\t"
420 "and r6, r3, #0xf80000 \n\t"
421 "pld [r1, #32] \n\t"
422 "lsl r3, r4, #8 \n\t"
423 "orr r3, r3, r5, lsr #5 \n\t"
424 "orr r3, r3, r6, lsr #19 \n\t"
425 "subs %[count], %[count], #1 \n\t"
426 "strh r3, [%[dst]], #2 \n\t"
427 "bne 1b \n\t"
428 "b 4f \n\t"
429 "2: \n\t"
430 "lsrs r7, r3, #24 \n\t"
431 "beq 3f \n\t"
432 "ldrh r4, [%[dst]] \n\t"
433 "rsb r7, r7, #255 \n\t"
434 "and r6, r4, #0x001f \n\t"
435 "ubfx r5, r4, #5, #6 \n\t"
436 "pld [r0, #16] \n\t"
437 "lsr r4, r4, #11 \n\t"
438 "smulbb r6, r6, r7 \n\t"
439 "smulbb r5, r5, r7 \n\t"
440 "smulbb r4, r4, r7 \n\t"
441 "ubfx r7, r3, #16, #8 \n\t"
442 "ubfx ip, r3, #8, #8 \n\t"
443 "and r3, r3, #0xff \n\t"
444 "add r6, r6, #16 \n\t"
445 "add r5, r5, #32 \n\t"
446 "add r4, r4, #16 \n\t"
447 "add r6, r6, r6, lsr #5 \n\t"
448 "add r5, r5, r5, lsr #6 \n\t"
449 "add r4, r4, r4, lsr #5 \n\t"
450 "add r6, r7, r6, lsr #5 \n\t"
451 "add r5, ip, r5, lsr #6 \n\t"
452 "add r4, r3, r4, lsr #5 \n\t"
453 "lsr r6, r6, #3 \n\t"
454 "and r5, r5, #0xfc \n\t"
455 "and r4, r4, #0xf8 \n\t"
456 "orr r6, r6, r5, lsl #3 \n\t"
457 "orr r4, r6, r4, lsl #8 \n\t"
458 "strh r4, [%[dst]], #2 \n\t"
459 "pld [r1, #32] \n\t"
460 "subs %[count], %[count], #1 \n\t"
461 "bne 1b \n\t"
462 "b 4f \n\t"
463 "3: \n\t"
464 "subs %[count], %[count], #1 \n\t"
465 "add %[dst], %[dst], #2 \n\t"
466 "bne 1b \n\t"
467 "4: \n\t"
468 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
469 :
470 : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
471 );
472 }
473 #define S32A_D565_Opaque_PROC S32A_D565_Opaque_v7
474 #define S32A_D565_Blend_PROC NULL
475 #define S32_D565_Blend_Dither_PROC NULL
476 #else
477 #define S32A_D565_Opaque_PROC NULL
478 #define S32A_D565_Blend_PROC NULL
479 #define S32_D565_Blend_Dither_PROC NULL
480 #endif
481
482 /* Don't have a special version that assumes each src is opaque, but our S32A
483 is still faster than the default, so use it here
484 */
485 #define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC
486 #define S32_D565_Blend_PROC S32A_D565_Blend_PROC
487
488 ///////////////////////////////////////////////////////////////////////////////
489
490 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)
491
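/* NEON src-over blit that also inspects source alpha: the row is walked as a
 * small state machine with separate fast paths for runs of fully transparent
 * pixels (skipped), fully opaque pixels (copied), and mixed-alpha pixels
 * (blended four at a time), predicting that neighbouring pixels share the
 * same kind of alpha.
 */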
492 static void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
493 const SkPMColor* SK_RESTRICT src,
494 int count, U8CPU alpha) {
495 SkASSERT(255 == alpha);
496 if (count <= 0)
497 return;
498
499 /* Use these to check if src is transparent or opaque */
500 const unsigned int ALPHA_OPAQ = 0xFF000000;
501 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
502
503 #define UNROLL 4
504 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
505 const SkPMColor* SK_RESTRICT src_temp = src;
506
507 /* set up the NEON variables */
508 uint8x8_t alpha_mask;
509 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
510 alpha_mask = vld1_u8(alpha_mask_setup);
511
512 uint8x8_t src_raw, dst_raw, dst_final;
513 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
514 uint8x8_t dst_cooked;
515 uint16x8_t dst_wide;
516 uint8x8_t alpha_narrow;
517 uint16x8_t alpha_wide;
518
519 /* choose the first processing type */
520 if( src >= src_end)
521 goto TAIL;
522 if(*src <= ALPHA_TRANS)
523 goto ALPHA_0;
524 if(*src >= ALPHA_OPAQ)
525 goto ALPHA_255;
526 /* fall-thru */
527
528 ALPHA_1_TO_254:
529 do {
530
531 /* get the source */
532 src_raw = vreinterpret_u8_u32(vld1_u32(src));
533 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
534
535 /* get and hold the dst too */
536 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
537 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
538
539
540 /* get the alphas spread out properly */
541 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
542 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
543 /* we collapsed (255-a)+1 ... */
544 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
545
546 /* spread the dest */
547 dst_wide = vmovl_u8(dst_raw);
548
549 /* alpha mul the dest */
550 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
551 dst_cooked = vshrn_n_u16(dst_wide, 8);
552
553 /* sum -- ignoring any byte lane overflows */
554 dst_final = vadd_u8(src_raw, dst_cooked);
555
556 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
557 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
558 /* we collapsed (255-a)+1 ... */
559 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
560
561 /* spread the dest */
562 dst_wide = vmovl_u8(dst_raw_2);
563
564 /* alpha mul the dest */
565 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
566 dst_cooked = vshrn_n_u16(dst_wide, 8);
567
568 /* sum -- ignoring any byte lane overflows */
569 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
570
571 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
572 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
573
574 src += UNROLL;
575 dst += UNROLL;
576
577 /* if 2 of the next pixels aren't between 1 and 254
578 it might make sense to go to the optimized loops */
579 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
580 break;
581
582 } while(src < src_end);
583
584 if (src >= src_end)
585 goto TAIL;
586
587 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
588 goto ALPHA_255;
589
590 /*fall-thru*/
591
592 ALPHA_0:
593
594 /*In this state, we know the current alpha is 0 and
595 we optimize for the next alpha also being zero. */
596 src_temp = src; //so we don't have to increment dst every time
597 do {
598 if(*(++src) > ALPHA_TRANS)
599 break;
600 if(*(++src) > ALPHA_TRANS)
601 break;
602 if(*(++src) > ALPHA_TRANS)
603 break;
604 if(*(++src) > ALPHA_TRANS)
605 break;
606 } while(src < src_end);
607
608 dst += (src - src_temp);
609
610 /* no longer alpha 0, so determine where to go next. */
611 if( src >= src_end)
612 goto TAIL;
613 if(*src >= ALPHA_OPAQ)
614 goto ALPHA_255;
615 else
616 goto ALPHA_1_TO_254;
617
618 ALPHA_255:
619 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
620 dst[0]=src[0];
621 dst[1]=src[1];
622 dst[2]=src[2];
623 dst[3]=src[3];
624 src+=UNROLL;
625 dst+=UNROLL;
626 if(src >= src_end)
627 goto TAIL;
628 }
629
630 //Handle remainder.
631 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
632 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
633 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
634 }
635 }
636
637 if( src >= src_end)
638 goto TAIL;
639 if(*src <= ALPHA_TRANS)
640 goto ALPHA_0;
641 else
642 goto ALPHA_1_TO_254;
643
644 TAIL:
645 /* do any residual iterations */
646 src_end += UNROLL + 1; // go to the real end
647 while(src != src_end) {
648 if( *src != 0 ) {
649 if( *src >= ALPHA_OPAQ ) {
650 *dst = *src;
651 }
652 else {
653 *dst = SkPMSrcOver(*src, *dst);
654 }
655 }
656 src++;
657 dst++;
658 }
659 return;
660 }
661
662 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon_test_alpha
663
664 #elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
665
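/* Plain NEON src-over for 32-bit premultiplied pixels: four pixels per pass
 * (as two 2-pixel halves), with a scalar tail loop for the remainder.
 */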
666 static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
667 const SkPMColor* SK_RESTRICT src,
668 int count, U8CPU alpha) {
669
670 SkASSERT(255 == alpha);
671 if (count > 0) {
672
673
674 uint8x8_t alpha_mask;
675
676 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
677 alpha_mask = vld1_u8(alpha_mask_setup);
678
679 /* do the NEON unrolled code */
680 #define UNROLL 4
681 while (count >= UNROLL) {
682 uint8x8_t src_raw, dst_raw, dst_final;
683 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
684
685 /* get the source */
686 src_raw = vreinterpret_u8_u32(vld1_u32(src));
687 #if UNROLL > 2
688 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
689 #endif
690
691 /* get and hold the dst too */
692 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
693 #if UNROLL > 2
694 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
695 #endif
696
697 /* 1st and 2nd bits of the unrolling */
698 {
699 uint8x8_t dst_cooked;
700 uint16x8_t dst_wide;
701 uint8x8_t alpha_narrow;
702 uint16x8_t alpha_wide;
703
704 /* get the alphas spread out properly */
705 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
706 #if 1
707 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
708 /* we collapsed (255-a)+1 ... */
709 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
710 #else
711 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
712 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
713 #endif
714
715 /* spread the dest */
716 dst_wide = vmovl_u8(dst_raw);
717
718 /* alpha mul the dest */
719 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
720 dst_cooked = vshrn_n_u16(dst_wide, 8);
721
722 /* sum -- ignoring any byte lane overflows */
723 dst_final = vadd_u8(src_raw, dst_cooked);
724 }
725
726 #if UNROLL > 2
727 /* the 3rd and 4th bits of our unrolling */
728 {
729 uint8x8_t dst_cooked;
730 uint16x8_t dst_wide;
731 uint8x8_t alpha_narrow;
732 uint16x8_t alpha_wide;
733
734 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
735 #if 1
736 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
737 /* we collapsed (255-a)+1 ... */
738 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
739 #else
740 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
741 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
742 #endif
743
744 /* spread the dest */
745 dst_wide = vmovl_u8(dst_raw_2);
746
747 /* alpha mul the dest */
748 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
749 dst_cooked = vshrn_n_u16(dst_wide, 8);
750
751 /* sum -- ignoring any byte lane overflows */
752 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
753 }
754 #endif
755
756 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
757 #if UNROLL > 2
758 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
759 #endif
760
761 src += UNROLL;
762 dst += UNROLL;
763 count -= UNROLL;
764 }
765 #undef UNROLL
766
767 /* do any residual iterations */
768 while (--count >= 0) {
769 #ifdef TEST_SRC_ALPHA
770 SkPMColor sc = *src;
771 if (sc) {
772 unsigned srcA = SkGetPackedA32(sc);
773 SkPMColor result = sc;
774 if (srcA != 255) {
775 result = SkPMSrcOver(sc, *dst);
776 }
777 *dst = result;
778 }
779 #else
780 *dst = SkPMSrcOver(*src, *dst);
781 #endif
782 src += 1;
783 dst += 1;
784 }
785 }
786 }
787
788 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
789
790 #elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */
791
792 #if defined(TEST_SRC_ALPHA)
793
794 static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha
795 (SkPMColor* SK_RESTRICT dst,
796 const SkPMColor* SK_RESTRICT src,
797 int count, U8CPU alpha) {
798
799 /* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
800 /* Predicts that the next pixel will have the same alpha type as the current pixel */
801
802 asm volatile (
803
804 "\tSTMDB r13!, {r4-r12, r14} \n" /* saving r4-r12, lr on the stack */
805 /* we should not save r0-r3 according to ABI */
806
807 "\tCMP r2, #0 \n" /* if (count == 0) */
808 "\tBEQ 9f \n" /* go to EXIT */
809
810 "\tMOV r12, #0xff \n" /* load the 0xff mask in r12 */
811 "\tORR r12, r12, r12, LSL #16 \n" /* convert it to 0xff00ff in r12 */
812
813 "\tMOV r14, #255 \n" /* r14 = 255 */
814 /* will be used later for left-side comparison */
815
816 "\tADD r2, %[src], r2, LSL #2 \n" /* r2 points to last array element which can be used */
817 "\tSUB r2, r2, #16 \n" /* as a base for 4-way processing algorithm */
818
819 "\tCMP %[src], r2 \n" /* if our current [src] array pointer is bigger than */
820 "\tBGT 8f \n" /* calculated marker for 4-way -> */
821 /* use simple one-by-one processing */
822
823 /* START OF DISPATCHING BLOCK */
824
825 "\t0: \n"
826
827 "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
828
829 "\tLSR r7, r3, #24 \n" /* if not all src alphas of 4-way block are equal -> */
830 "\tCMP r7, r4, LSR #24 \n"
831 "\tCMPEQ r7, r5, LSR #24 \n"
832 "\tCMPEQ r7, r6, LSR #24 \n"
833 "\tBNE 1f \n" /* -> go to general 4-way processing routine */
834
835 "\tCMP r14, r7 \n" /* if all src alphas are equal to 255 */
836 "\tBEQ 3f \n" /* go to alpha == 255 optimized routine */
837
838 "\tCMP r7, #0 \n" /* if all src alphas are equal to 0 */
839 "\tBEQ 6f \n" /* go to alpha == 0 optimized routine */
840
841 /* END OF DISPATCHING BLOCK */
842
843 /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
844
845 "\t1: \n"
846 /* we do not have enough registers to make */
847 /* 4-way [dst] loading -> we are using 2 * 2-way */
848
849 "\tLDM %[dst], {r7, r8} \n" /* 1st 2-way loading of dst values to r7-r8 */
850
851 /* PROCESSING BLOCK 1 */
852 /* r3 = src, r7 = dst */
853
854 "\tLSR r11, r3, #24 \n" /* extracting alpha from source and storing to r11 */
855 "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */
856 "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */
857 "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */
858 "\tMUL r9, r9, r11 \n" /* br = br * scale */
859 "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */
860 "\tMUL r10, r10, r11 \n" /* ag = ag * scale */
861 "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */
862 "\tORR r7, r9, r10 \n" /* br | ag */
863 "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */
864
865 /* PROCESSING BLOCK 2 */
866 /* r4 = src, r8 = dst */
867
868 "\tLSR r11, r4, #24 \n" /* see PROCESSING BLOCK 1 */
869 "\tAND r9, r12, r8 \n"
870 "\tRSB r11, r11, #256 \n"
871 "\tAND r10, r12, r8, LSR #8 \n"
872 "\tMUL r9, r9, r11 \n"
873 "\tAND r9, r12, r9, LSR #8 \n"
874 "\tMUL r10, r10, r11 \n"
875 "\tAND r10, r10, r12, LSL #8 \n"
876 "\tORR r8, r9, r10 \n"
877 "\tADD r8, r4, r8 \n"
878
879 "\tSTM %[dst]!, {r7, r8} \n" /* 1st 2-way storing of processed dst values */
880
881 "\tLDM %[dst], {r9, r10} \n" /* 2nd 2-way loading of dst values to r9-r10 */
882
883 /* PROCESSING BLOCK 3 */
884 /* r5 = src, r9 = dst */
885
886 "\tLSR r11, r5, #24 \n" /* see PROCESSING BLOCK 1 */
887 "\tAND r7, r12, r9 \n"
888 "\tRSB r11, r11, #256 \n"
889 "\tAND r8, r12, r9, LSR #8 \n"
890 "\tMUL r7, r7, r11 \n"
891 "\tAND r7, r12, r7, LSR #8 \n"
892 "\tMUL r8, r8, r11 \n"
893 "\tAND r8, r8, r12, LSL #8 \n"
894 "\tORR r9, r7, r8 \n"
895 "\tADD r9, r5, r9 \n"
896
897 /* PROCESSING BLOCK 4 */
898 /* r6 = src, r10 = dst */
899
900 "\tLSR r11, r6, #24 \n" /* see PROCESSING BLOCK 1 */
901 "\tAND r7, r12, r10 \n"
902 "\tRSB r11, r11, #256 \n"
903 "\tAND r8, r12, r10, LSR #8 \n"
904 "\tMUL r7, r7, r11 \n"
905 "\tAND r7, r12, r7, LSR #8 \n"
906 "\tMUL r8, r8, r11 \n"
907 "\tAND r8, r8, r12, LSL #8 \n"
908 "\tORR r10, r7, r8 \n"
909 "\tADD r10, r6, r10 \n"
910
911 "\tSTM %[dst]!, {r9, r10} \n" /* 2nd 2-way storing of processed dst values */
912
913 "\tCMP %[src], r2 \n" /* if our current [src] pointer <= calculated marker */
914 "\tBLE 0b \n" /* we could run 4-way processing -> go to dispatcher */
915 "\tBGT 8f \n" /* else -> use simple one-by-one processing */
916
917 /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
918
919 /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */
920
921 "\t2: \n" /* ENTRY 1: LOADING [src] to registers */
922
923 "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
924
925 "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */
926 "\tAND r8, r5, r6 \n"
927 "\tAND r9, r7, r8 \n"
928 "\tCMP r14, r9, LSR #24 \n"
929 "\tBNE 4f \n" /* -> go to alpha == 0 check */
930
931 "\t3: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
932
933 "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
934
935 "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
936 "\tBLE 2b \n" /* we could run 4-way processing */
937 /* because now we're in ALPHA == 255 state */
938 /* run next cycle with priority alpha == 255 checks */
939
940 "\tBGT 8f \n" /* if our current [src] array pointer > marker */
941 /* use simple one-by-one processing */
942
943 "\t4: \n"
944
945 "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */
946 "\tORR r8, r5, r6 \n"
947 "\tORR r9, r7, r8 \n"
948 "\tLSRS r9, #24 \n"
949 "\tBNE 1b \n" /* -> go to general processing mode */
950 /* (we already checked for alpha == 255) */
951
952 "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */
953
954 "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
955 "\tBLE 5f \n" /* we could run 4-way processing one more time */
956 /* because now we're in ALPHA == 0 state */
957 /* run next cycle with priority alpha == 0 checks */
958
959 "\tBGT 8f \n" /* if our current [src] array pointer > marker */
960 /* use simple one-by-one processing */
961
962 /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */
963
964 /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */
965
966 "\t5: \n" /* ENTRY 1: LOADING [src] to registers */
967
968 "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */
969
970 "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */
971 "\tORR r8, r5, r6 \n"
972 "\tORR r9, r7, r8 \n"
973 "\tLSRS r9, #24 \n"
974 "\tBNE 7f \n" /* -> go to alpha == 255 check */
975
976 "\t6: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
977
978 "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */
979
980 "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
981 "\tBLE 5b \n" /* we could run 4-way processing one more time */
982 /* because now we're in ALPHA == 0 state */
983 /* run next cycle with priority alpha == 0 checks */
984
985 "\tBGT 8f \n" /* if our current [src] array pointer > marker */
986 /* use simple one-by-one processing */
987 "\t7: \n"
988
989 "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */
990 "\tAND r8, r5, r6 \n"
991 "\tAND r9, r7, r8 \n"
992 "\tCMP r14, r9, LSR #24 \n"
993 "\tBNE 1b \n" /* -> go to general processing mode */
994 /* (we already checked for alpha == 0) */
995
996 "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
997
998 "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */
999 "\tBLE 2b \n" /* we could run 4-way processing one more time */
1000 /* because now we're in ALPHA == 255 state */
1001 /* run next cycle with priority alpha == 255 checks */
1002
1003 "\tBGT 8f \n" /* if our current [src] array pointer > marker */
1004 /* use simple one-by-one processing */
1005
1006 /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */
1007
1008 /* START OF TAIL BLOCK */
1009 /* (used when array is too small to be processed with 4-way algorithm)*/
1010
1011 "\t8: \n"
1012
1013 "\tADD r2, r2, #16 \n" /* now r2 points to the element just after array */
1014 /* we've done r2 = r2 - 16 at procedure start */
1015
1016 "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */
1017 "\tBEQ 9f \n" /* goto EXIT */
1018
1019 /* TAIL PROCESSING BLOCK 1 */
1020
1021 "\tLDR r3, [%[src]], #4 \n" /* r3 = *src, src++ */
1022 "\tLDR r7, [%[dst]] \n" /* r7 = *dst */
1023
1024 "\tLSR r11, r3, #24 \n" /* extracting alpha from source */
1025 "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */
1026 "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */
1027 "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */
1028 "\tMUL r9, r9, r11 \n" /* br = br * scale */
1029 "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */
1030 "\tMUL r10, r10, r11 \n" /* ag = ag * scale */
1031 "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */
1032 "\tORR r7, r9, r10 \n" /* br | ag */
1033 "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */
1034
1035 "\tSTR r7, [%[dst]], #4 \n" /* *dst = r7; dst++ */
1036
1037 "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */
1038 "\tBEQ 9f \n" /* goto EXIT */
1039
1040 /* TAIL PROCESSING BLOCK 2 */
1041
1042 "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */
1043 "\tLDR r7, [%[dst]] \n"
1044
1045 "\tLSR r11, r3, #24 \n"
1046 "\tAND r9, r12, r7 \n"
1047 "\tRSB r11, r11, #256 \n"
1048 "\tAND r10, r12, r7, LSR #8 \n"
1049 "\tMUL r9, r9, r11 \n"
1050 "\tAND r9, r12, r9, LSR #8 \n"
1051 "\tMUL r10, r10, r11 \n"
1052 "\tAND r10, r10, r12, LSL #8 \n"
1053 "\tORR r7, r9, r10 \n"
1054 "\tADD r7, r3, r7 \n"
1055
1056 "\tSTR r7, [%[dst]], #4 \n"
1057
1058 "\tCMP %[src], r2 \n"
1059 "\tBEQ 9f \n"
1060
1061 /* TAIL PROCESSING BLOCK 3 */
1062
1063 "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */
1064 "\tLDR r7, [%[dst]] \n"
1065
1066 "\tLSR r11, r3, #24 \n"
1067 "\tAND r9, r12, r7 \n"
1068 "\tRSB r11, r11, #256 \n"
1069 "\tAND r10, r12, r7, LSR #8 \n"
1070 "\tMUL r9, r9, r11 \n"
1071 "\tAND r9, r12, r9, LSR #8 \n"
1072 "\tMUL r10, r10, r11 \n"
1073 "\tAND r10, r10, r12, LSL #8 \n"
1074 "\tORR r7, r9, r10 \n"
1075 "\tADD r7, r3, r7 \n"
1076
1077 "\tSTR r7, [%[dst]], #4 \n"
1078
1079 /* END OF TAIL BLOCK */
1080
1081 "\t9: \n" /* EXIT */
1082
1083 "\tLDMIA r13!, {r4-r12, r14} \n" /* restoring r4-r12, lr from stack */
1084 "\tBX lr \n" /* return */
1085
1086 : [dst] "+r" (dst), [src] "+r" (src)
1087 :
1088 : "cc", "r2", "r3", "memory"
1089
1090 );
1091
1092 }
1093
1094 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm_test_alpha
1095 #else /* !defined(TEST_SRC_ALPHA) */
1096
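/* Plain ARM (non-NEON) src-over: a two-pixel main loop plus a single-pixel
 * tail, using the 0x00ff00ff mask trick to scale the a/g and r/b byte pairs
 * of each pixel separately.
 */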
1097 static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
1098 const SkPMColor* SK_RESTRICT src,
1099 int count, U8CPU alpha) {
1100
1101 SkASSERT(255 == alpha);
1102
1103 /* Does not support the TEST_SRC_ALPHA case */
1104 asm volatile (
1105 "cmp %[count], #0 \n\t" /* comparing count with 0 */
1106 "beq 3f \n\t" /* if zero exit */
1107
1108 "mov ip, #0xff \n\t" /* load the 0xff mask in ip */
1109 "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */
1110
1111 "cmp %[count], #2 \n\t" /* compare count with 2 */
1112 "blt 2f \n\t" /* if less than 2 -> single loop */
1113
1114 /* Double Loop */
1115 "1: \n\t" /* <double loop> */
1116 "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */
1117 "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */
1118 "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
1119
1120 /* ----------- */
1121 "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
1122 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
1123 "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
1124
1125 "mul r9, r9, r4 \n\t" /* br = br * scale */
1126 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
1127 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
1128
1129 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
1130 "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
1131 "orr r7, r9, r10 \n\t" /* br | ag*/
1132
1133 "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */
1134 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */
1135
1136 /* ----------- */
1137 "and r9, ip, r8 \n\t" /* r9 = br masked by ip */
1138
1139 "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */
1140 "mul r9, r9, r4 \n\t" /* br = br * scale */
1141 "sub %[count], %[count], #2 \n\t"
1142 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
1143
1144 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
1145 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
1146 "cmp %[count], #1 \n\t" /* comparing count with 1 */
1147 "orr r8, r9, r10 \n\t" /* br | ag */
1148
1149 "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */
1150
1151 /* ----------------- */
1152 "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */
1153 /* ----------------- */
1154
1155 "bgt 1b \n\t" /* if greater than 1 -> reloop */
1156 "blt 3f \n\t" /* if less than 1 -> exit */
1157
1158 /* Single Loop */
1159 "2: \n\t" /* <single loop> */
1160 "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */
1161 "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */
1162 "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
1163
1164 /* ----------- */
1165 "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
1166 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
1167
1168 "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
1169 "mul r9, r9, r4 \n\t" /* br = br * scale */
1170 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
1171 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
1172
1173 "and r10, r10, ip, lsl #8 \n\t" /* mask ag */
1174 "orr r7, r9, r10 \n\t" /* br | ag */
1175
1176 "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */
1177
1178 /* ----------------- */
1179 "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */
1180 /* ----------------- */
1181
1182 "3: \n\t" /* <exit> */
1183 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
1184 :
1185 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
1186 );
1187 }
1188 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm
1189 #endif /* !defined(TEST_SRC_ALPHA) */
1190 #else /* ... #elif defined (__ARM_ARCH__) */
1191 #define S32A_Opaque_BlitRow32_PROC NULL
1192 #endif
1193
1194 /*
1195 * ARM asm version of S32A_Blend_BlitRow32
1196 */
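/* Per pixel the asm below computes, in SkAlphaMulQ() terms:
 *   src_scale = alpha + 1
 *   dst_scale = 256 - ((SkGetPackedA32(src) * src_scale) >> 8)
 *   *dst      = SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale)
 * again using the 0x00ff00ff mask to handle the a/g and r/b byte pairs.
 */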
1197 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
1198 const SkPMColor* SK_RESTRICT src,
1199 int count, U8CPU alpha) {
1200 asm volatile (
1201 "cmp %[count], #0 \n\t" /* comparing count with 0 */
1202 "beq 3f \n\t" /* if zero exit */
1203
1204 "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */
1205 "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */
1206
1207 /* src1,2_scale */
1208 "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */
1209
1210 "cmp %[count], #2 \n\t" /* comparing count with 2 */
1211 "blt 2f \n\t" /* if less than 2 -> single loop */
1212
1213 /* Double Loop */
1214 "1: \n\t" /* <double loop> */
1215 "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */
1216 "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */
1217
1218 /* dst1_scale and dst2_scale*/
1219 "lsr r9, r5, #24 \n\t" /* src >> 24 */
1220 "lsr r10, r6, #24 \n\t" /* src >> 24 */
1221 "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */
1222 "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */
1223 "lsr r9, r9, #8 \n\t" /* r9 >> 8 */
1224 "lsr r10, r10, #8 \n\t" /* r10 >> 8 */
1225 "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
1226 "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
1227
1228 /* ---------------------- */
1229
1230 /* src1, src1_scale */
1231 "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
1232 "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */
1233 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
1234 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
1235 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1236 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
1237 "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */
1238
1239 /* dst1, dst1_scale */
1240 "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
1241 "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */
1242 "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */
1243 "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */
1244 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1245 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
1246 "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */
1247
1248 /* ---------------------- */
1249 "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */
1250 /* ---------------------- */
1251
1252 /* ====================== */
1253
1254 /* src2, src2_scale */
1255 "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
1256 "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */
1257 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
1258 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
1259 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1260 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
1261 "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */
1262
1263 /* dst2, dst2_scale */
1264 "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
1265 "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */
1266 "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */
1267 "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */
1268 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1269 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
1270 "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */
1271
1272 "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */
1273 /* ---------------------- */
1274 "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */
1275 /* ---------------------- */
1276 "cmp %[count], #1 \n\t" /* compare count with 1 */
1277 /* ----------------- */
1278 "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */
1279 /* ----------------- */
1280
1281 "bgt 1b \n\t" /* if %[count] greater than 1 reloop */
1282 "blt 3f \n\t" /* if %[count] less than 1 exit */
1283 /* else get into the single loop */
1284 /* Single Loop */
1285 "2: \n\t" /* <single loop> */
1286 "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */
1287 "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */
1288
1289 "lsr r6, r5, #24 \n\t" /* src >> 24 */
1290 "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
1291 "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */
1292 "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */
1293 "lsr r6, r6, #8 \n\t" /* r6 >> 8 */
1294 "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */
1295 "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */
1296
1297 /* src, src_scale */
1298 "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */
1299 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1300 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
1301 "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */
1302
1303 /* dst, dst_scale */
1304 "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
1305 "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */
1306 "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */
1307 "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */
1308 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
1309 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
1310 "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */
1311
1312 "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */
1313
1314 /* ----------------- */
1315 "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */
1316 /* ----------------- */
1317
1318 "3: \n\t" /* <exit> */
1319 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
1320 :
1321 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
1322 );
1323
1324 }
1325 #define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
1326
1327 /* Neon version of S32_Blend_BlitRow32()
1328 * portable version is in src/core/SkBlitRow_D32.cpp
1329 */
1330 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1331 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1332 const SkPMColor* SK_RESTRICT src,
1333 int count, U8CPU alpha) {
1334 SkASSERT(alpha <= 255);
1335 if (count > 0) {
1336 uint16_t src_scale = SkAlpha255To256(alpha);
1337 uint16_t dst_scale = 256 - src_scale;
1338
1339 /* run them N at a time through the NEON unit */
1340 /* note that each pixel is 4 bytes, each byte treated exactly the same,
1341 * so we can work under that guise. We *do* know that the src & dst
1342 * will be 32-bit aligned quantities, so we can specify that on
1343 * the load/store ops and do a neon 'reinterpret' to get us to
1344 * byte-sized (pun intended) pieces that we widen/multiply/shift.
1345 * We're limited to 128 bits in the wide ops, which is 8 x 16 bits
1346 * or a pair of 32-bit src/dsts.
1347 */
1348 /* we *could* manually unroll this loop so that we load 128 bits
1349 * (as a pair of 64s) from each of src and dst, processing them
1350 * in pieces. This might give us a little better management of
1351 * the memory latency, but my initial attempts here did not
1352 * produce an instruction stream that looked all that nice.
1353 */
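        /* Per byte lane the vector loop below evaluates
         * (src * src_scale + dst * dst_scale) >> 8, the same expression the
         * SkAlphaMulQ() sum in the scalar tail uses for a leftover pixel. */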
1354 #define UNROLL 2
1355 while (count >= UNROLL) {
1356 uint8x8_t src_raw, dst_raw, dst_final;
1357 uint16x8_t src_wide, dst_wide;
1358
1359 /* get 64 bits of src, widen it, multiply by src_scale */
1360 src_raw = vreinterpret_u8_u32(vld1_u32(src));
1361 src_wide = vmovl_u8(src_raw);
1362 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
1363 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
1364
1365 /* ditto with dst */
1366 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
1367 dst_wide = vmovl_u8(dst_raw);
1368
1369 /* combine add with dst multiply into mul-accumulate */
1370 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
1371
1372 dst_final = vshrn_n_u16(dst_wide, 8);
1373 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
1374
1375 src += UNROLL;
1376 dst += UNROLL;
1377 count -= UNROLL;
1378 }
1379 /* RBE: well, I don't like how gcc manages src/dst across the above
1380 * loop; it's constantly calculating src+bias, dst+bias and it only
1381 * adjusts the real ones when we leave the loop. Not sure why
1382 * it's "hoisting down" (hoisting implies above in my lexicon ;))
1383 * the adjustments to src/dst/count, but it does...
1384 * (might be SSA-style internal logic...)
1385 */
1386
1387 #if UNROLL == 2
1388 if (count == 1) {
1389 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1390 }
1391 #else
1392 if (count > 0) {
1393 do {
1394 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1395 src += 1;
1396 dst += 1;
1397 } while (--count > 0);
1398 }
1399 #endif
1400
1401 #undef UNROLL
1402 }
1403 }
1404
1405 #define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon
1406 #else
1407 #define S32_Blend_BlitRow32_PROC NULL
1408 #endif
1409
1410 ///////////////////////////////////////////////////////////////////////////////
1411
1412 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1413
1414 #undef DEBUG_OPAQUE_DITHER
1415
1416 #if defined(DEBUG_OPAQUE_DITHER)
1417 static void showme8(char *str, void *p, int len)
1418 {
1419 static char buf[256];
1420 char tbuf[32];
1421 int i;
1422 char *pc = (char*) p;
1423 sprintf(buf,"%8s:", str);
1424 for(i=0;i<len;i++) {
1425 sprintf(tbuf, " %02x", pc[i]);
1426 strcat(buf, tbuf);
1427 }
1428 SkDebugf("%s\n", buf);
1429 }
1430 static void showme16(char *str, void *p, int len)
1431 {
1432 static char buf[256];
1433 char tbuf[32];
1434 int i;
1435 uint16_t *pc = (uint16_t*) p;
1436 sprintf(buf,"%8s:", str);
1437 len = (len / sizeof(uint16_t)); /* passed as bytes */
1438 for(i=0;i<len;i++) {
1439 sprintf(tbuf, " %04x", pc[i]);
1440 strcat(buf, tbuf);
1441 }
1442 SkDebugf("%s\n", buf);
1443 }
1444 #endif
1445
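/* Dithering src-over blit of 32-bit premultiplied pixels onto RGB565, eight
 * pixels per NEON pass with a scalar loop for the remainder. When
 * DEBUG_OPAQUE_DITHER is defined, every vector pass is cross-checked against
 * a scalar computation into tmpbuf[] and mismatches are dumped via the
 * showme8()/showme16() helpers above.
 */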
1446 static void S32A_D565_Opaque_Dither_neon(uint16_t * SK_RESTRICT dst,
1447 const SkPMColor* SK_RESTRICT src,
1448 int count, U8CPU alpha, int x, int y) {
1449 SkASSERT(255 == alpha);
1450
1451 #define UNROLL 8
1452
1453 if (count >= UNROLL) {
1454 uint8x8_t dbase;
1455
1456 #if defined(DEBUG_OPAQUE_DITHER)
1457 uint16_t tmpbuf[UNROLL];
1458 int td[UNROLL];
1459 int tdv[UNROLL];
1460 int ta[UNROLL];
1461 int tap[UNROLL];
1462 uint16_t in_dst[UNROLL];
1463 int offset = 0;
1464 int noisy = 0;
1465 #endif
1466
1467 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1468 dbase = vld1_u8(dstart);
1469
1470 do {
1471 uint8x8_t sr, sg, sb, sa, d;
1472 uint16x8_t dst8, scale8, alpha8;
1473 uint16x8_t dst_r, dst_g, dst_b;
1474
1475 #if defined(DEBUG_OPAQUE_DITHER)
1476 /* calculate 8 elements worth into a temp buffer */
1477 {
1478 int my_y = y;
1479 int my_x = x;
1480 SkPMColor* my_src = (SkPMColor*)src;
1481 uint16_t* my_dst = dst;
1482 int i;
1483
1484 DITHER_565_SCAN(my_y);
1485 for(i=0;i<UNROLL;i++) {
1486 SkPMColor c = *my_src++;
1487 SkPMColorAssert(c);
1488 if (c) {
1489 unsigned a = SkGetPackedA32(c);
1490
1491 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1492 tdv[i] = DITHER_VALUE(my_x);
1493 ta[i] = a;
1494 tap[i] = SkAlpha255To256(a);
1495 td[i] = d;
1496
1497 unsigned sr = SkGetPackedR32(c);
1498 unsigned sg = SkGetPackedG32(c);
1499 unsigned sb = SkGetPackedB32(c);
1500 sr = SkDITHER_R32_FOR_565(sr, d);
1501 sg = SkDITHER_G32_FOR_565(sg, d);
1502 sb = SkDITHER_B32_FOR_565(sb, d);
1503
1504 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1505 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1506 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1507 // now src and dst expanded are in g:11 r:10 x:1 b:10
1508 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1509 td[i] = d;
1510
1511 } else {
1512 tmpbuf[i] = *my_dst;
1513 ta[i] = tdv[i] = td[i] = 0xbeef;
1514 }
1515 in_dst[i] = *my_dst;
1516 my_dst += 1;
1517 DITHER_INC_X(my_x);
1518 }
1519 }
1520 #endif
1521
1522 /* source is in ABGR */
1523 {
1524 register uint8x8_t d0 asm("d0");
1525 register uint8x8_t d1 asm("d1");
1526 register uint8x8_t d2 asm("d2");
1527 register uint8x8_t d3 asm("d3");
1528
1529 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1530 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1531 : "r" (src)
1532 );
1533 sr = d0; sg = d1; sb = d2; sa = d3;
1534 }
1535
1536 /* calculate 'd', which will be 0..7 */
1537 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1538 #if defined(SK_BUILD_FOR_ANDROID)
1539 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1540 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1541 #else
1542 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1543 #endif
1544 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1545 d = vshrn_n_u16(alpha8, 8); /* narrowing too */
1546
1547 /* sr = sr - (sr>>5) + d */
1548 /* watching for 8-bit overflow. d is 0..7; risky range of
1549 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1550 * safe as long as we do ((sr-sr>>5) + d) */
1551 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1552 sr = vadd_u8(sr, d);
1553
1554 /* sb = sb - (sb>>5) + d */
1555 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1556 sb = vadd_u8(sb, d);
1557
1558 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1559 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1560 sg = vadd_u8(sg, vshr_n_u8(d,1));
1561
1562 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1563 dst8 = vld1q_u16(dst);
1564 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1565 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1566 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
1567
1568 /* blend */
1569 #if 1
1570 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1571 /* originally 255-sa + 1 */
1572 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1573 #else
1574 scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1575 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1576 #endif
1577
1578 #if 1
1579 /* combine the addq and mul, save 3 insns */
1580 scale8 = vshrq_n_u16(scale8, 3);
1581 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1582 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1583 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1584 #else
1585 /* known correct, but +3 insns over above */
1586 scale8 = vshrq_n_u16(scale8, 3);
1587 dst_b = vmulq_u16(dst_b, scale8);
1588 dst_g = vmulq_u16(dst_g, scale8);
1589 dst_r = vmulq_u16(dst_r, scale8);
1590
1591 /* combine */
1592 /* NB: vshll widens, need to preserve those bits */
1593 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1594 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1595 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1596 #endif
1597
1598 /* repack to store */
1599 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1600 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1601 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1602
1603 vst1q_u16(dst, dst8);
1604
#if defined(DEBUG_OPAQUE_DITHER)
            /* verify that my 8 elements match the temp buffer */
            {
                int i, bad = 0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) bad = 1;
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf(" alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++) {
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i]) ? "BAD" : "got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i],
                                 td[i], tdv[i], tap[i], ta[i]);
                    }

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* give up after reporting the first bad batch */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL: adding 8 leaves (x & 3), the only part
             * DITHER_VALUE() looks at, unchanged */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an ICE in debug builds.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
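                // Each channel occupies a disjoint 10/11-bit field, so a
                // single 32-bit multiply scales r, g and b at once; the >> 5
                // below removes the x32 scale before compacting back to 565.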
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
#else
#define S32A_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/iteration and
 * this NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * i.e. roughly a 10x reduction in per-element instruction count versus
 * the native version; that is pure instruction counting, not accounting
 * for any instruction or memory latencies.
 */

#undef DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                        const SkPMColor* SK_RESTRICT src,
                                        int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);
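        /* gDitherMatrix_Neon is indexed with a row stride of 12, which
         * presumably means each 4-entry dither row is stored three times in
         * succession, so this 8-byte load starting at offset (x&3) never
         * runs into the next row's values. */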

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb, sa;
            uint16x8_t dr, dg, db, da;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                     : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                     : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* XXX: if we want to prefetch, hide it in the above asm();
             * with gcc's __builtin_prefetch() the prefetch falls to the
             * bottom of the loop -- it won't stay up at the top of the
             * loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + (d>>1); similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));
            /* XXX: check that the "d>>1" here is hoisted out of the loop */

            /* pack the high bits of each channel into 565 format (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
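            /* Per pixel this computes ((r - (r>>5)) + d) >> 3 for red and
             * blue, and ((g - (g>>6)) + (d>>1)) >> 2 for green -- the
             * residual loop below gets the same result via
             * SkDitherRGB32To565(), which is what the debug check compares
             * against. */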

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i = 0; i < UNROLL; i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;    /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
#else
#define S32_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,                           // S32_Opaque,
    S32_Blend_BlitRow32_PROC,       // S32_Blend,
    S32A_Opaque_BlitRow32_PROC,     // S32A_Opaque,
    S32A_Blend_BlitRow32_PROC       // S32A_Blend
};
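/* These tables are indexed directly by the flags value handed to the
 * PlatformProcs*() hooks below; assuming the usual SkBlitRow flag layout
 * (global alpha, per-pixel alpha and dither as separate bits), entries 0-3
 * are the non-dithered variants and 4-7 the dithered ones.  A NULL entry
 * tells the caller to fall back to the portable implementation. */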

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}

SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
    return NULL;
}

///////////////////////////////////////////////////////////////////////////////

SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                     SkMask::Format maskFormat,
                                                     SkColor color) {
    return NULL;
}

SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
    return NULL;
}

SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
                                                 SkMask::Format maskFormat,
                                                 RowFlags flags) {
    return NULL;
}
