1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRow_opts_arm_neon.h"
9
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16
17 #include "SkCachePreload_arm.h"
18 #include "SkColor_opts_neon.h"
19 #include <arm_neon.h>
20
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src, int count,
23 U8CPU alpha, int /*x*/, int /*y*/) {
24 SkASSERT(255 == alpha);
25
26 while (count >= 8) {
27 uint8x8x4_t vsrc;
28 uint16x8_t vdst;
29
30 // Load
31 vsrc = vld4_u8((uint8_t*)src);
32
33 // Convert src to 565
34 vdst = SkPixel32ToPixel16_neon8(vsrc);
35
36 // Store
37 vst1q_u16(dst, vdst);
38
39 // Prepare next iteration
40 dst += 8;
41 src += 8;
42 count -= 8;
43 }
44
45 // Leftovers
46 while (count > 0) {
47 SkPMColor c = *src++;
48 SkPMColorAssert(c);
49 *dst = SkPixel32ToPixel16_ToU16(c);
50 dst++;
51 count--;
52 }
53 }
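/* Illustrative sketch (not compiled): the scalar 8888 -> 565 packing that the
 * vectorized loop in S32_D565_Opaque_neon performs eight pixels at a time; the
 * leftovers loop above uses SkPixel32ToPixel16_ToU16() for the same conversion.
 * The helper name below is hypothetical.
 */
#if 0
static inline uint16_t pixel32_to_565_sketch(SkPMColor c) {
    // keep the top 5/6/5 bits of each channel, as the NEON narrowing does
    return SkPackRGB16(SkGetPackedR32(c) >> 3,
                       SkGetPackedG32(c) >> 2,
                       SkGetPackedB32(c) >> 3);
}
#endif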
54
55 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
56 const SkPMColor* SK_RESTRICT src, int count,
57 U8CPU alpha, int /*x*/, int /*y*/) {
58 SkASSERT(255 == alpha);
59
60 if (count >= 8) {
61 uint16_t* SK_RESTRICT keep_dst = 0;
62
63 asm volatile (
64 "ands ip, %[count], #7 \n\t"
65 "vmov.u8 d31, #1<<7 \n\t"
66 "vld1.16 {q12}, [%[dst]] \n\t"
67 "vld4.8 {d0-d3}, [%[src]] \n\t"
68 // Thumb does not support the standard ARM conditional
69 // instructions but instead requires the 'it' instruction
70 // to signal conditional execution
71 "it eq \n\t"
72 "moveq ip, #8 \n\t"
73 "mov %[keep_dst], %[dst] \n\t"
74
75 "add %[src], %[src], ip, LSL#2 \n\t"
76 "add %[dst], %[dst], ip, LSL#1 \n\t"
77 "subs %[count], %[count], ip \n\t"
78 "b 9f \n\t"
79 // LOOP
80 "2: \n\t"
81
82 "vld1.16 {q12}, [%[dst]]! \n\t"
83 "vld4.8 {d0-d3}, [%[src]]! \n\t"
84 "vst1.16 {q10}, [%[keep_dst]] \n\t"
85 "sub %[keep_dst], %[dst], #8*2 \n\t"
86 "subs %[count], %[count], #8 \n\t"
87 "9: \n\t"
88 "pld [%[dst],#32] \n\t"
89 // expand 0565 q12 to 8888 {d4-d7}
90 "vmovn.u16 d4, q12 \n\t"
91 "vshr.u16 q11, q12, #5 \n\t"
92 "vshr.u16 q10, q12, #6+5 \n\t"
93 "vmovn.u16 d5, q11 \n\t"
94 "vmovn.u16 d6, q10 \n\t"
95 "vshl.u8 d4, d4, #3 \n\t"
96 "vshl.u8 d5, d5, #2 \n\t"
97 "vshl.u8 d6, d6, #3 \n\t"
98
99 "vmovl.u8 q14, d31 \n\t"
100 "vmovl.u8 q13, d31 \n\t"
101 "vmovl.u8 q12, d31 \n\t"
102
103 // duplicate in 4/2/1 & 8pix vsns
104 "vmvn.8 d30, d3 \n\t"
105 "vmlal.u8 q14, d30, d6 \n\t"
106 "vmlal.u8 q13, d30, d5 \n\t"
107 "vmlal.u8 q12, d30, d4 \n\t"
108 "vshr.u16 q8, q14, #5 \n\t"
109 "vshr.u16 q9, q13, #6 \n\t"
110 "vaddhn.u16 d6, q14, q8 \n\t"
111 "vshr.u16 q8, q12, #5 \n\t"
112 "vaddhn.u16 d5, q13, q9 \n\t"
113 "vqadd.u8 d6, d6, d0 \n\t" // moved up
114 "vaddhn.u16 d4, q12, q8 \n\t"
115 // intentionally don't calculate alpha
116 // result in d4-d6
117
118 "vqadd.u8 d5, d5, d1 \n\t"
119 "vqadd.u8 d4, d4, d2 \n\t"
120
121 // pack 8888 {d4-d6} to 0565 q10
122 "vshll.u8 q10, d6, #8 \n\t"
123 "vshll.u8 q3, d5, #8 \n\t"
124 "vshll.u8 q2, d4, #8 \n\t"
125 "vsri.u16 q10, q3, #5 \n\t"
126 "vsri.u16 q10, q2, #11 \n\t"
127
128 "bne 2b \n\t"
129
130 "1: \n\t"
131 "vst1.16 {q10}, [%[keep_dst]] \n\t"
132 : [count] "+r" (count)
133 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
134 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
135 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
136 "d30","d31"
137 );
138 }
139 else
140 { // handle count < 8
141 uint16_t* SK_RESTRICT keep_dst = 0;
142
143 asm volatile (
144 "vmov.u8 d31, #1<<7 \n\t"
145 "mov %[keep_dst], %[dst] \n\t"
146
147 "tst %[count], #4 \n\t"
148 "beq 14f \n\t"
149 "vld1.16 {d25}, [%[dst]]! \n\t"
150 "vld1.32 {q1}, [%[src]]! \n\t"
151
152 "14: \n\t"
153 "tst %[count], #2 \n\t"
154 "beq 12f \n\t"
155 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
156 "vld1.32 {d1}, [%[src]]! \n\t"
157
158 "12: \n\t"
159 "tst %[count], #1 \n\t"
160 "beq 11f \n\t"
161 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
162 "vld1.32 {d0[1]}, [%[src]]! \n\t"
163
164 "11: \n\t"
165 // unzips achieve the same as a vld4 operation
166 "vuzpq.u16 q0, q1 \n\t"
167 "vuzp.u8 d0, d1 \n\t"
168 "vuzp.u8 d2, d3 \n\t"
169 // expand 0565 q12 to 8888 {d4-d7}
170 "vmovn.u16 d4, q12 \n\t"
171 "vshr.u16 q11, q12, #5 \n\t"
172 "vshr.u16 q10, q12, #6+5 \n\t"
173 "vmovn.u16 d5, q11 \n\t"
174 "vmovn.u16 d6, q10 \n\t"
175 "vshl.u8 d4, d4, #3 \n\t"
176 "vshl.u8 d5, d5, #2 \n\t"
177 "vshl.u8 d6, d6, #3 \n\t"
178
179 "vmovl.u8 q14, d31 \n\t"
180 "vmovl.u8 q13, d31 \n\t"
181 "vmovl.u8 q12, d31 \n\t"
182
183 // duplicate in 4/2/1 & 8pix vsns
184 "vmvn.8 d30, d3 \n\t"
185 "vmlal.u8 q14, d30, d6 \n\t"
186 "vmlal.u8 q13, d30, d5 \n\t"
187 "vmlal.u8 q12, d30, d4 \n\t"
188 "vshr.u16 q8, q14, #5 \n\t"
189 "vshr.u16 q9, q13, #6 \n\t"
190 "vaddhn.u16 d6, q14, q8 \n\t"
191 "vshr.u16 q8, q12, #5 \n\t"
192 "vaddhn.u16 d5, q13, q9 \n\t"
193 "vqadd.u8 d6, d6, d0 \n\t" // moved up
194 "vaddhn.u16 d4, q12, q8 \n\t"
195 // intentionally don't calculate alpha
196 // result in d4-d6
197
198 "vqadd.u8 d5, d5, d1 \n\t"
199 "vqadd.u8 d4, d4, d2 \n\t"
200
201 // pack 8888 {d4-d6} to 0565 q10
202 "vshll.u8 q10, d6, #8 \n\t"
203 "vshll.u8 q3, d5, #8 \n\t"
204 "vshll.u8 q2, d4, #8 \n\t"
205 "vsri.u16 q10, q3, #5 \n\t"
206 "vsri.u16 q10, q2, #11 \n\t"
207
208 // store
209 "tst %[count], #4 \n\t"
210 "beq 24f \n\t"
211 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
212
213 "24: \n\t"
214 "tst %[count], #2 \n\t"
215 "beq 22f \n\t"
216 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
217
218 "22: \n\t"
219 "tst %[count], #1 \n\t"
220 "beq 21f \n\t"
221 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
222
223 "21: \n\t"
224 : [count] "+r" (count)
225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
228 "d30","d31"
229 );
230 }
231 }
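/* Illustrative sketch (not compiled) of what S32A_D565_Opaque_neon computes per
 * pixel: expand the 565 destination to 8-bit channels, apply the usual
 * premultiplied src-over (src + dst * (255 - a) / 255, saturating), and repack
 * to 565. The NEON code uses shift-based approximations of the divide, so its
 * output can differ slightly from this model. The helper name is hypothetical.
 */
#if 0
static inline uint16_t srcover_565_sketch(SkPMColor c, uint16_t d) {
    unsigned isa = 255 - SkGetPackedA32(c);
    // expand 565 -> 888 by replicating the high bits into the low bits
    unsigned dr = (SkGetPackedR16(d) << 3) | (SkGetPackedR16(d) >> 2);
    unsigned dg = (SkGetPackedG16(d) << 2) | (SkGetPackedG16(d) >> 4);
    unsigned db = (SkGetPackedB16(d) << 3) | (SkGetPackedB16(d) >> 2);
    unsigned r = SkGetPackedR32(c) + (dr * isa + 127) / 255;
    unsigned g = SkGetPackedG32(c) + (dg * isa + 127) / 255;
    unsigned b = SkGetPackedB32(c) + (db * isa + 127) / 255;
    if (r > 255) r = 255;   // the vqadd.u8 instructions saturate the same way
    if (g > 255) g = 255;
    if (b > 255) b = 255;
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif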
232
233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
234 const SkPMColor* SK_RESTRICT src, int count,
235 U8CPU alpha, int /*x*/, int /*y*/) {
236
237 U8CPU alpha_for_asm = alpha;
238
239 asm volatile (
240 /* This code implements a Neon version of S32A_D565_Blend. The output differs from
241 * the original in two respects:
242 * 1. The results have a few mismatches compared to the original code. These mismatches
243 * never exceed 1. It's possible to improve accuracy vs. a floating point
244 * implementation by introducing rounding right shifts (vrshr) for the final stage.
245 * Rounding is not present in the code below, because although results would be closer
246 * to a floating point implementation, the number of mismatches compared to the
247 * original code would be far greater.
248 * 2. On certain inputs, the original code can overflow, causing colour channels to
249 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel
250 * to affect another.
251 */
252
253 #if 1
254 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
255 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256
256 #else
257 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256
258 #endif
259 "vmov.u16 q3, #255 \n\t" // set up constant
260 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3
261 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon
262 "beq 2f \n\t" // if count8 == 0, exit
263 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
264
265 "1: \n\t"
266 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels
267 "subs r4, r4, #1 \n\t" // decrement loop counter
268 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels
269 // and deinterleave
270
271 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes
272 "vand q10, q0, q15 \n\t" // extract blue
273 "vshr.u16 q8, q0, #11 \n\t" // extract red
274 "vshr.u16 q9, q9, #10 \n\t" // extract green
275 // dstrgb = {q8, q9, q10}
276
277 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range
278 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range
279 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range
280
281 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits
282 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits
283 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits
284 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits
285 // srcrgba = {q11, q12, q13, q14}
286
287 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale
288 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale
289 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale
290 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale
291
292 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8
293 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8)
294 // dst_scale = q2
295
296 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale
297 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale
298 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale
299
300 #if 1
301 // trying for a better match with SkDiv255Round(a)
302 // C alg is: a+=128; (a+a>>8)>>8
303 // we'll use just a rounding shift [q2 is available for scratch]
304 "vrshr.u16 q11, q11, #8 \n\t" // shift down red
305 "vrshr.u16 q12, q12, #8 \n\t" // shift down green
306 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
307 #else
308 // arm's original "truncating divide by 256"
309 "vshr.u16 q11, q11, #8 \n\t" // shift down red
310 "vshr.u16 q12, q12, #8 \n\t" // shift down green
311 "vshr.u16 q13, q13, #8 \n\t" // shift down blue
312 #endif
313
314 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
315 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
316 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr
317
318 "bne 1b \n\t" // if counter != 0, loop
319 "2: \n\t" // exit
320
321 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
322 :
323 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
324 );
325
326 count &= 7;
327 if (count > 0) {
328 do {
329 SkPMColor sc = *src++;
330 if (sc) {
331 uint16_t dc = *dst;
332 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
333 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
334 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
335 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
336 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
337 }
338 dst += 1;
339 } while (--count != 0);
340 }
341 }
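/* Illustrative sketch (not compiled) of the per-channel arithmetic in the NEON
 * loop of S32A_D565_Blend_neon, with channel values already in 565 range.
 * Note the ">> 8" divides: the scalar leftovers loop above divides by 255
 * (SkMulDiv255Round / SkDiv255Round) instead, which is the source of the
 * occasional off-by-one differences mentioned in the comment at the top of the
 * function. The helper name is hypothetical.
 */
#if 0
static inline unsigned blend565_channel_sketch(unsigned src565, unsigned dst565,
                                               unsigned srcA, unsigned src_scale /*1..256*/) {
    unsigned dst_scale = 255 - ((srcA * src_scale) >> 8);           // vmul + vshr + vsub
    return (src565 * src_scale + dst565 * dst_scale + 128) >> 8;    // vmul/vmla + vrshr #8
}
#endif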
342
343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
344 * each dither value is spaced out into byte lanes, and repeated
345 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
346 * start of each row.
347 */
348 static const uint8_t gDitherMatrix_Neon[48] = {
349 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
350 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
351 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
352 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
353
354 };
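/* Illustrative check (not compiled): each 12-byte row above is the matching
 * 4-entry row of the 4x4 dither matrix repeated three times, so that
 * &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)] always has 8 valid bytes for
 * vld1_u8() regardless of the x offset. The helper name is hypothetical.
 */
#if 0
static void check_dither_matrix_layout() {
    static const uint8_t base[4][4] = {
        { 0, 4, 1, 5 }, { 6, 2, 7, 3 }, { 1, 5, 0, 4 }, { 7, 3, 6, 2 },
    };
    for (int row = 0; row < 4; row++) {
        for (int i = 0; i < 12; i++) {
            SkASSERT(gDitherMatrix_Neon[row * 12 + i] == base[row][i & 3]);
        }
    }
}
#endif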
355
356 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
357 int count, U8CPU alpha, int x, int y)
358 {
359
360 SkASSERT(255 > alpha);
361
362 // rescale alpha to range 1 - 256
363 int scale = SkAlpha255To256(alpha);
364
365 if (count >= 8) {
366 /* select row and offset for dither array */
367 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
368
369 uint8x8_t vdither = vld1_u8(dstart); // load dither values
370 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
371
372 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
373 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
374
375 do {
376
377 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
378 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
379 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
380 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
381 uint16x8_t vdst;
382 uint16x8_t vdst_r, vdst_g, vdst_b;
383 int16x8_t vres_r, vres_g, vres_b;
384 int8x8_t vres8_r, vres8_g, vres8_b;
385
386 // Load source and add dither
387 {
388 register uint8x8_t d0 asm("d0");
389 register uint8x8_t d1 asm("d1");
390 register uint8x8_t d2 asm("d2");
391 register uint8x8_t d3 asm("d3");
392
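// The vld4.8 below is issued through inline asm with its outputs pinned to
// d0-d3, so the four deinterleaved channels land in known registers and
// src is post-incremented by the same instruction.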
393 asm (
394 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
395 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
396 :
397 );
398 vsrc_g = d1;
399 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
400 vsrc_r = d2; vsrc_b = d0;
401 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
402 vsrc_r = d0; vsrc_b = d2;
403 #endif
404 }
405
406 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
407 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
408 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
409
410 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
411 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
412 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
413
414 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
415 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
416 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
417
418 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
419 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
420 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
421
422 // Load dst and unpack
423 vdst = vld1q_u16(dst);
424 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
425 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
426 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
427
428 // subtract dst from src and widen
429 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
430 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
431 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
432
433 // multiply diffs by scale and shift
434 vres_r = vmulq_s16(vres_r, vscale);
435 vres_g = vmulq_s16(vres_g, vscale);
436 vres_b = vmulq_s16(vres_b, vscale);
437
438 vres8_r = vshrn_n_s16(vres_r, 8);
439 vres8_g = vshrn_n_s16(vres_g, 8);
440 vres8_b = vshrn_n_s16(vres_b, 8);
441
442 // add dst to result
443 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
444 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
445 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
446
447 // put result into 565 format
448 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
449 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
450
451 // Store result
452 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
453
454 // Next iteration
455 dst += 8;
456 count -= 8;
457
458 } while (count >= 8);
459 }
460
461 // Leftovers
462 if (count > 0) {
463 int scale = SkAlpha255To256(alpha);
464 DITHER_565_SCAN(y);
465 do {
466 SkPMColor c = *src++;
467 SkPMColorAssert(c);
468
469 int dither = DITHER_VALUE(x);
470 int sr = SkGetPackedR32(c);
471 int sg = SkGetPackedG32(c);
472 int sb = SkGetPackedB32(c);
473 sr = SkDITHER_R32To565(sr, dither);
474 sg = SkDITHER_G32To565(sg, dither);
475 sb = SkDITHER_B32To565(sb, dither);
476
477 uint16_t d = *dst;
478 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
479 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
480 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
481 DITHER_INC_X(x);
482 } while (--count != 0);
483 }
484 }
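/* Illustrative sketch (not compiled) of the per-channel blend performed above:
 * after dithering the source down to 565 range, the NEON code interpolates as
 * dst + (((src - dst) * scale) >> 8), the same lerp the leftovers loop expresses
 * with SkAlphaBlend(). The helper name is hypothetical.
 */
#if 0
static inline int blend565_dither_channel_sketch(int src565, int dst565,
                                                 int scale /*1..256*/) {
    return dst565 + (((src565 - dst565) * scale) >> 8);   // vsub/vmul/vshrn/vaddw
}
#endif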
485
486 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
487 const SkPMColor* SK_RESTRICT src,
488 int count, U8CPU alpha) {
489
490 SkASSERT(255 == alpha);
491 if (count > 0) {
492
493
494 uint8x8_t alpha_mask;
495
496 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
497 alpha_mask = vld1_u8(alpha_mask_setup);
498
499 /* do the NEON unrolled code */
500 #define UNROLL 4
501 while (count >= UNROLL) {
502 uint8x8_t src_raw, dst_raw, dst_final;
503 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
504
505 /* The two prefetches below may make the code slightly
506 * slower for small values of count but are worth having
507 * in the general case.
508 */
509 __builtin_prefetch(src+32);
510 __builtin_prefetch(dst+32);
511
512 /* get the source */
513 src_raw = vreinterpret_u8_u32(vld1_u32(src));
514 #if UNROLL > 2
515 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
516 #endif
517
518 /* get and hold the dst too */
519 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
520 #if UNROLL > 2
521 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
522 #endif
523
524 /* 1st and 2nd bits of the unrolling */
525 {
526 uint8x8_t dst_cooked;
527 uint16x8_t dst_wide;
528 uint8x8_t alpha_narrow;
529 uint16x8_t alpha_wide;
530
531 /* get the alphas spread out properly */
532 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
533 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
534
535 /* spread the dest */
536 dst_wide = vmovl_u8(dst_raw);
537
538 /* alpha mul the dest */
539 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
540 dst_cooked = vshrn_n_u16(dst_wide, 8);
541
542 /* sum -- ignoring any byte lane overflows */
543 dst_final = vadd_u8(src_raw, dst_cooked);
544 }
545
546 #if UNROLL > 2
547 /* the 3rd and 4th bits of our unrolling */
548 {
549 uint8x8_t dst_cooked;
550 uint16x8_t dst_wide;
551 uint8x8_t alpha_narrow;
552 uint16x8_t alpha_wide;
553
554 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
555 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
556
557 /* spread the dest */
558 dst_wide = vmovl_u8(dst_raw_2);
559
560 /* alpha mul the dest */
561 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
562 dst_cooked = vshrn_n_u16(dst_wide, 8);
563
564 /* sum -- ignoring any byte lane overflows */
565 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
566 }
567 #endif
568
569 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
570 #if UNROLL > 2
571 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
572 #endif
573
574 src += UNROLL;
575 dst += UNROLL;
576 count -= UNROLL;
577 }
578 #undef UNROLL
579
580 /* do any residual iterations */
581 while (--count >= 0) {
582 *dst = SkPMSrcOver(*src, *dst);
583 src += 1;
584 dst += 1;
585 }
586 }
587 }
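/* Illustrative sketch (not compiled) of the per-pixel math in the unrolled NEON
 * loop above, assuming the usual layout with alpha in the top byte (as the vtbl
 * alpha_mask does): every byte of dst is scaled by (256 - srcAlpha), shifted
 * down by 8, and added to the corresponding src byte with wrap-around
 * (vadd.u8), matching what SkPMSrcOver() computes for premultiplied pixels.
 * The helper name is hypothetical.
 */
#if 0
static inline SkPMColor srcover_lane_sketch(SkPMColor src, SkPMColor dst) {
    unsigned scale = 256 - SkGetPackedA32(src);   // vsubw_u8(vdupq_n_u16(256), alpha)
    SkPMColor res = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
        unsigned s = (src >> shift) & 0xFF;
        unsigned d = (dst >> shift) & 0xFF;
        res |= ((s + ((d * scale) >> 8)) & 0xFF) << shift;   // vmulq_u16/vshrn/vadd_u8
    }
    return res;
}
#endif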
588
589 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
590 const SkPMColor* SK_RESTRICT src,
591 int count, U8CPU alpha) {
592 SkASSERT(255 == alpha);
593
594 if (count <= 0)
595 return;
596
597 /* Use these to check if src is transparent or opaque */
598 const unsigned int ALPHA_OPAQ = 0xFF000000;
599 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
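/* The code below is effectively a small state machine over the source alpha:
 * runs of fully transparent pixels are skipped (ALPHA_0), runs of fully opaque
 * pixels are copied straight through (ALPHA_255), and everything else goes
 * through the general NEON blend loop (ALPHA_1_TO_254); the code hops between
 * the three cases by peeking at the upcoming source pixels. */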
600
601 #define UNROLL 4
602 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
603 const SkPMColor* SK_RESTRICT src_temp = src;
604
605 /* set up the NEON variables */
606 uint8x8_t alpha_mask;
607 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
608 alpha_mask = vld1_u8(alpha_mask_setup);
609
610 uint8x8_t src_raw, dst_raw, dst_final;
611 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
612 uint8x8_t dst_cooked;
613 uint16x8_t dst_wide;
614 uint8x8_t alpha_narrow;
615 uint16x8_t alpha_wide;
616
617 /* choose the first processing type */
618 if( src >= src_end)
619 goto TAIL;
620 if(*src <= ALPHA_TRANS)
621 goto ALPHA_0;
622 if(*src >= ALPHA_OPAQ)
623 goto ALPHA_255;
624 /* fall-thru */
625
626 ALPHA_1_TO_254:
627 do {
628
629 /* get the source */
630 src_raw = vreinterpret_u8_u32(vld1_u32(src));
631 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
632
633 /* get and hold the dst too */
634 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
635 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
636
637
638 /* get the alphas spread out properly */
639 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
640 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
641 /* we collapsed (255-a)+1 ... */
642 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
643
644 /* spread the dest */
645 dst_wide = vmovl_u8(dst_raw);
646
647 /* alpha mul the dest */
648 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
649 dst_cooked = vshrn_n_u16(dst_wide, 8);
650
651 /* sum -- ignoring any byte lane overflows */
652 dst_final = vadd_u8(src_raw, dst_cooked);
653
654 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
655 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
656 /* we collapsed (255-a)+1 ... */
657 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
658
659 /* spread the dest */
660 dst_wide = vmovl_u8(dst_raw_2);
661
662 /* alpha mul the dest */
663 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
664 dst_cooked = vshrn_n_u16(dst_wide, 8);
665
666 /* sum -- ignoring any byte lane overflows */
667 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
668
669 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
670 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
671
672 src += UNROLL;
673 dst += UNROLL;
674
675 /* if the next 2 pixels aren't between 1 and 254,
676 it might make sense to go to the optimized loops */
677 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
678 break;
679
680 } while(src < src_end);
681
682 if (src >= src_end)
683 goto TAIL;
684
685 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
686 goto ALPHA_255;
687
688 /*fall-thru*/
689
690 ALPHA_0:
691
692 /*In this state, we know the current alpha is 0 and
693 we optimize for the next alpha also being zero. */
694 src_temp = src; //so we don't have to increment dst every time
695 do {
696 if(*(++src) > ALPHA_TRANS)
697 break;
698 if(*(++src) > ALPHA_TRANS)
699 break;
700 if(*(++src) > ALPHA_TRANS)
701 break;
702 if(*(++src) > ALPHA_TRANS)
703 break;
704 } while(src < src_end);
705
706 dst += (src - src_temp);
707
708 /* no longer alpha 0, so determine where to go next. */
709 if( src >= src_end)
710 goto TAIL;
711 if(*src >= ALPHA_OPAQ)
712 goto ALPHA_255;
713 else
714 goto ALPHA_1_TO_254;
715
716 ALPHA_255:
717 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
718 dst[0]=src[0];
719 dst[1]=src[1];
720 dst[2]=src[2];
721 dst[3]=src[3];
722 src+=UNROLL;
723 dst+=UNROLL;
724 if(src >= src_end)
725 goto TAIL;
726 }
727
728 //Handle remainder.
729 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
730 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
731 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
732 }
733 }
734
735 if( src >= src_end)
736 goto TAIL;
737 if(*src <= ALPHA_TRANS)
738 goto ALPHA_0;
739 else
740 goto ALPHA_1_TO_254;
741
742 TAIL:
743 /* do any residual iterations */
744 src_end += UNROLL + 1; // go to the real end
745 while(src != src_end) {
746 if( *src != 0 ) {
747 if( *src >= ALPHA_OPAQ ) {
748 *dst = *src;
749 }
750 else {
751 *dst = SkPMSrcOver(*src, *dst);
752 }
753 }
754 src++;
755 dst++;
756 }
757
758 #undef UNROLL
759 return;
760 }
761
762 /* Neon version of S32_Blend_BlitRow32()
763 * portable version is in src/core/SkBlitRow_D32.cpp
764 */
765 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
766 const SkPMColor* SK_RESTRICT src,
767 int count, U8CPU alpha) {
768 SkASSERT(alpha <= 255);
769 if (count > 0) {
770 uint16_t src_scale = SkAlpha255To256(alpha);
771 uint16_t dst_scale = 256 - src_scale;
772
773 /* run them N at a time through the NEON unit */
774 /* note that each pixel is 4 bytes, and each byte is treated exactly the
775 * same, so we can work under that guise. We *do* know that the src & dst
776 * will be 32-bit aligned quantities, so we can specify that on
777 * the load/store ops and do a neon 'reinterpret' to get us to
778 * byte-sized (pun intended) pieces that we widen/multiply/shift.
779 * We're limited to 128 bits in the wide ops, which is 8x16 bits,
780 * or a pair of 32-bit src/dsts (see the per-byte sketch after this function).
781 */
782 /* we *could* manually unroll this loop so that we load 128 bits
783 * (as a pair of 64s) from each of src and dst, processing them
784 * in pieces. This might give us a little better management of
785 * the memory latency, but my initial attempts here did not
786 * produce an instruction stream that looked all that nice.
787 */
788 #define UNROLL 2
789 while (count >= UNROLL) {
790 uint8x8_t src_raw, dst_raw, dst_final;
791 uint16x8_t src_wide, dst_wide;
792
793 /* get 64 bits of src, widen it, multiply by src_scale */
794 src_raw = vreinterpret_u8_u32(vld1_u32(src));
795 src_wide = vmovl_u8(src_raw);
796 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
797 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
798
799 /* ditto with dst */
800 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
801 dst_wide = vmovl_u8(dst_raw);
802
803 /* combine add with dst multiply into mul-accumulate */
804 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
805
806 dst_final = vshrn_n_u16(dst_wide, 8);
807 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
808
809 src += UNROLL;
810 dst += UNROLL;
811 count -= UNROLL;
812 }
813 /* RBE: well, I don't like how gcc manages src/dst across the above
814 * loop: it's constantly calculating src+bias and dst+bias, and it only
815 * adjusts the real ones when we leave the loop. Not sure why
816 * it's "hoisting down" (hoisting implies above in my lexicon ;))
817 * the adjustments to src/dst/count, but it does...
818 * (might be SSA-style internal logic...)
819 */
820
821 #if UNROLL == 2
822 if (count == 1) {
823 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
824 }
825 #else
826 if (count > 0) {
827 do {
828 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
829 src += 1;
830 dst += 1;
831 } while (--count > 0);
832 }
833 #endif
834
835 #undef UNROLL
836 }
837 }
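/* Illustrative sketch (not compiled) of the per-byte arithmetic that the
 * vmovl/vmul/vmla/vshrn sequence above performs on each of its byte lanes,
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
 * The helper name is hypothetical.
 */
#if 0
static inline uint8_t blend32_byte_sketch(uint8_t s, uint8_t d,
                                          uint16_t src_scale, uint16_t dst_scale) {
    return (uint8_t)((s * src_scale + d * dst_scale) >> 8);
}
#endif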
838
839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
840 const SkPMColor* SK_RESTRICT src,
841 int count, U8CPU alpha) {
842
843 SkASSERT(255 >= alpha);
844
845 if (count <= 0) {
846 return;
847 }
848
849 unsigned alpha256 = SkAlpha255To256(alpha);
850
851 // First deal with odd counts
852 if (count & 1) {
853 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
854 uint16x8_t vdst_wide, vsrc_wide;
855 unsigned dst_scale;
856
857 // Load
858 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
859 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
860
861 // Calc dst_scale
862 dst_scale = vget_lane_u8(vsrc, 3);
863 dst_scale *= alpha256;
864 dst_scale >>= 8;
865 dst_scale = 256 - dst_scale;
866
867 // Process src
868 vsrc_wide = vmovl_u8(vsrc);
869 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
870
871 // Process dst
872 vdst_wide = vmovl_u8(vdst);
873 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
874
875 // Combine
876 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
877
878 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
879 dst++;
880 src++;
881 count--;
882 }
883
884 if (count) {
885 uint8x8_t alpha_mask;
886 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
887 alpha_mask = vld1_u8(alpha_mask_setup);
888
889 do {
890
891 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
892 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
893
894 __builtin_prefetch(src+32);
895 __builtin_prefetch(dst+32);
896
897 // Load
898 vsrc = vreinterpret_u8_u32(vld1_u32(src));
899 vdst = vreinterpret_u8_u32(vld1_u32(dst));
900
901 // Prepare src_scale
902 vsrc_scale = vdupq_n_u16(alpha256);
903
904 // Calc dst_scale
905 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
906 vdst_scale = vmovl_u8(vsrc_alphas);
907 vdst_scale *= vsrc_scale;
908 vdst_scale = vshrq_n_u16(vdst_scale, 8);
909 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
910
911 // Process src
912 vsrc_wide = vmovl_u8(vsrc);
913 vsrc_wide *= vsrc_scale;
914
915 // Process dst
916 vdst_wide = vmovl_u8(vdst);
917 vdst_wide *= vdst_scale;
918
919 // Combine
920 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
921
922 vst1_u32(dst, vreinterpret_u32_u8(vres));
923
924 src += 2;
925 dst += 2;
926 count -= 2;
927 } while(count);
928 }
929 }
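/* Illustrative sketch (not compiled) of S32A_Blend_BlitRow32_neon's per-pixel
 * math, assuming the usual layout with alpha in the top byte (as the vtbl
 * alpha_mask assumes): src is scaled by alpha256 = SkAlpha255To256(alpha), dst
 * by 256 - ((srcAlpha * alpha256) >> 8), and each product is shifted down by 8
 * per byte lane before the two halves are added. The helper name is hypothetical.
 */
#if 0
static inline SkPMColor blend32a_pixel_sketch(SkPMColor src, SkPMColor dst,
                                              unsigned alpha256) {
    unsigned dst_scale = 256 - ((SkGetPackedA32(src) * alpha256) >> 8);
    SkPMColor res = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
        unsigned s = (src >> shift) & 0xFF;
        unsigned d = (dst >> shift) & 0xFF;
        res |= ((((s * alpha256) >> 8) + ((d * dst_scale) >> 8)) & 0xFF) << shift;
    }
    return res;
}
#endif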
930
931 ///////////////////////////////////////////////////////////////////////////////
932
933 #undef DEBUG_OPAQUE_DITHER
934
935 #if defined(DEBUG_OPAQUE_DITHER)
936 static void showme8(char *str, void *p, int len)
937 {
938 static char buf[256];
939 char tbuf[32];
940 int i;
941 char *pc = (char*) p;
942 sprintf(buf,"%8s:", str);
943 for(i=0;i<len;i++) {
944 sprintf(tbuf, " %02x", pc[i]);
945 strcat(buf, tbuf);
946 }
947 SkDebugf("%s\n", buf);
948 }
949 static void showme16(char *str, void *p, int len)
950 {
951 static char buf[256];
952 char tbuf[32];
953 int i;
954 uint16_t *pc = (uint16_t*) p;
955 sprintf(buf,"%8s:", str);
956 len = (len / sizeof(uint16_t)); /* passed as bytes */
957 for(i=0;i<len;i++) {
958 sprintf(tbuf, " %04x", pc[i]);
959 strcat(buf, tbuf);
960 }
961 SkDebugf("%s\n", buf);
962 }
963 #endif
964
965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
966 const SkPMColor* SK_RESTRICT src,
967 int count, U8CPU alpha, int x, int y) {
968 SkASSERT(255 == alpha);
969
970 #define UNROLL 8
971
972 if (count >= UNROLL) {
973 uint8x8_t dbase;
974
975 #if defined(DEBUG_OPAQUE_DITHER)
976 uint16_t tmpbuf[UNROLL];
977 int td[UNROLL];
978 int tdv[UNROLL];
979 int ta[UNROLL];
980 int tap[UNROLL];
981 uint16_t in_dst[UNROLL];
982 int offset = 0;
983 int noisy = 0;
984 #endif
985
986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
987 dbase = vld1_u8(dstart);
988
989 do {
990 uint8x8_t sr, sg, sb, sa, d;
991 uint16x8_t dst8, scale8, alpha8;
992 uint16x8_t dst_r, dst_g, dst_b;
993
994 #if defined(DEBUG_OPAQUE_DITHER)
995 /* calculate 8 elements worth into a temp buffer */
996 {
997 int my_y = y;
998 int my_x = x;
999 SkPMColor* my_src = (SkPMColor*)src;
1000 uint16_t* my_dst = dst;
1001 int i;
1002
1003 DITHER_565_SCAN(my_y);
1004 for(i=0;i<UNROLL;i++) {
1005 SkPMColor c = *my_src++;
1006 SkPMColorAssert(c);
1007 if (c) {
1008 unsigned a = SkGetPackedA32(c);
1009
1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1011 tdv[i] = DITHER_VALUE(my_x);
1012 ta[i] = a;
1013 tap[i] = SkAlpha255To256(a);
1014 td[i] = d;
1015
1016 unsigned sr = SkGetPackedR32(c);
1017 unsigned sg = SkGetPackedG32(c);
1018 unsigned sb = SkGetPackedB32(c);
1019 sr = SkDITHER_R32_FOR_565(sr, d);
1020 sg = SkDITHER_G32_FOR_565(sg, d);
1021 sb = SkDITHER_B32_FOR_565(sb, d);
1022
1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1026 // now src and dst expanded are in g:11 r:10 x:1 b:10
1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1028 td[i] = d;
1029
1030 } else {
1031 tmpbuf[i] = *my_dst;
1032 ta[i] = tdv[i] = td[i] = 0xbeef;
1033 }
1034 in_dst[i] = *my_dst;
1035 my_dst += 1;
1036 DITHER_INC_X(my_x);
1037 }
1038 }
1039 #endif
1040
1041 /* source is in ABGR */
1042 {
1043 register uint8x8_t d0 asm("d0");
1044 register uint8x8_t d1 asm("d1");
1045 register uint8x8_t d2 asm("d2");
1046 register uint8x8_t d3 asm("d3");
1047
1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1050 : "r" (src)
1051 );
1052 sr = d0; sg = d1; sb = d2; sa = d3;
1053 }
1054
1055 /* calculate 'd', which will be 0..7 */
1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1057 #if defined(SK_BUILD_FOR_ANDROID)
1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1060 #else
1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1062 #endif
1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */
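/* d now holds, for each of the 8 pixels, the dither value scaled by that
 * pixel's alpha -- the same (dither * SkAlpha255To256(a)) >> 8 that the
 * residual loop below computes with SkAlphaMul(). */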
1065
1066 /* sr = sr - (sr>>5) + d */
1067 /* watching for 8-bit overflow. d is 0..7; risky range of
1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1069 * safe as long as we do ((sr-sr>>5) + d) */
1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1071 sr = vadd_u8(sr, d);
1072
1073 /* sb = sb - (sb>>5) + d */
1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1075 sb = vadd_u8(sb, d);
1076
1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1079 sg = vadd_u8(sg, vshr_n_u8(d,1));
1080
1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1082 dst8 = vld1q_u16(dst);
1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
1086
1087 /* blend */
1088 #if 1
1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1090 /* originally 255-sa + 1 */
1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1092 #else
1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1095 #endif
1096
1097 #if 1
1098 /* combine the addq and mul, save 3 insns */
1099 scale8 = vshrq_n_u16(scale8, 3);
1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1103 #else
1104 /* known correct, but +3 insns over above */
1105 scale8 = vshrq_n_u16(scale8, 3);
1106 dst_b = vmulq_u16(dst_b, scale8);
1107 dst_g = vmulq_u16(dst_g, scale8);
1108 dst_r = vmulq_u16(dst_r, scale8);
1109
1110 /* combine */
1111 /* NB: vshll widens, need to preserve those bits */
1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1115 #endif
1116
1117 /* repack to store */
1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1121
1122 vst1q_u16(dst, dst8);
1123
1124 #if defined(DEBUG_OPAQUE_DITHER)
1125 /* verify my 8 elements match the temp buffer */
1126 {
1127 int i, bad=0;
1128 static int invocation;
1129
1130 for (i=0;i<UNROLL;i++)
1131 if (tmpbuf[i] != dst[i]) bad=1;
1132 if (bad) {
1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1134 invocation, offset);
1135 SkDebugf(" alpha 0x%x\n", alpha);
1136 for (i=0;i<UNROLL;i++)
1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1140
1141 showme16("alpha8", &alpha8, sizeof(alpha8));
1142 showme16("scale8", &scale8, sizeof(scale8));
1143 showme8("d", &d, sizeof(d));
1144 showme16("dst8", &dst8, sizeof(dst8));
1145 showme16("dst_b", &dst_b, sizeof(dst_b));
1146 showme16("dst_g", &dst_g, sizeof(dst_g));
1147 showme16("dst_r", &dst_r, sizeof(dst_r));
1148 showme8("sb", &sb, sizeof(sb));
1149 showme8("sg", &sg, sizeof(sg));
1150 showme8("sr", &sr, sizeof(sr));
1151
1152 /* cop out */
1153 return;
1154 }
1155 offset += UNROLL;
1156 invocation++;
1157 }
1158 #endif
1159
1160 dst += UNROLL;
1161 src += UNROLL;
1162 count -= UNROLL;
1163 /* skip x += UNROLL, since it's unchanged mod-4 */
1164 } while (count >= UNROLL);
1165 }
1166 #undef UNROLL
1167
1168 /* residuals */
1169 if (count > 0) {
1170 DITHER_565_SCAN(y);
1171 do {
1172 SkPMColor c = *src++;
1173 SkPMColorAssert(c);
1174 if (c) {
1175 unsigned a = SkGetPackedA32(c);
1176
1177 // dither and alpha are just temporary variables to work around
1178 // an ICE (internal compiler error) in debug builds.
1179 unsigned dither = DITHER_VALUE(x);
1180 unsigned alpha = SkAlpha255To256(a);
1181 int d = SkAlphaMul(dither, alpha);
1182
1183 unsigned sr = SkGetPackedR32(c);
1184 unsigned sg = SkGetPackedG32(c);
1185 unsigned sb = SkGetPackedB32(c);
1186 sr = SkDITHER_R32_FOR_565(sr, d);
1187 sg = SkDITHER_G32_FOR_565(sg, d);
1188 sb = SkDITHER_B32_FOR_565(sb, d);
1189
1190 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1191 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1192 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1193 // now src and dst expanded are in g:11 r:10 x:1 b:10
1194 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1195 }
1196 dst += 1;
1197 DITHER_INC_X(x);
1198 } while (--count != 0);
1199 }
1200 }
1201
1202 ///////////////////////////////////////////////////////////////////////////////
1203
1204 #undef DEBUG_S32_OPAQUE_DITHER
1205
1206 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1207 const SkPMColor* SK_RESTRICT src,
1208 int count, U8CPU alpha, int x, int y) {
1209 SkASSERT(255 == alpha);
1210
1211 #define UNROLL 8
1212 if (count >= UNROLL) {
1213 uint8x8_t d;
1214 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1215 d = vld1_u8(dstart);
1216
1217 while (count >= UNROLL) {
1218 uint8x8_t sr, sg, sb;
1219 uint16x8_t dr, dg, db;
1220 uint16x8_t dst8;
1221
1222 {
1223 register uint8x8_t d0 asm("d0");
1224 register uint8x8_t d1 asm("d1");
1225 register uint8x8_t d2 asm("d2");
1226 register uint8x8_t d3 asm("d3");
1227
1228 asm (
1229 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1230 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1231 :
1232 );
1233 sg = d1;
1234 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1235 sr = d2; sb = d0;
1236 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1237 sr = d0; sb = d2;
1238 #endif
1239 }
1240 /* XXX: if we want to prefetch, hide it in the above asm()
1241 * using the gcc __builtin_prefetch(), the prefetch will
1242 * fall to the bottom of the loop -- it won't stick up
1243 * at the top of the loop, just after the vld4.
1244 */
1245
1246 // sr = sr - (sr>>5) + d
1247 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1248 dr = vaddl_u8(sr, d);
1249
1250 // sb = sb - (sb>>5) + d
1251 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1252 db = vaddl_u8(sb, d);
1253
1254 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1255 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1256 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1257
1258 // pack high bits of each into 565 format (rgb, b is lsb)
1259 dst8 = vshrq_n_u16(db, 3);
1260 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1261 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1262
1263 // store it
1264 vst1q_u16(dst, dst8);
1265
1266 #if defined(DEBUG_S32_OPAQUE_DITHER)
1267 // always good to know if we generated good results
1268 {
1269 int i, myx = x, myy = y;
1270 DITHER_565_SCAN(myy);
1271 for (i=0;i<UNROLL;i++) {
1272 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1273 SkPMColor c = src[i-8];
1274 unsigned dither = DITHER_VALUE(myx);
1275 uint16_t val = SkDitherRGB32To565(c, dither);
1276 if (val != dst[i]) {
1277 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1278 c, dither, val, dst[i], dstart[i]);
1279 }
1280 DITHER_INC_X(myx);
1281 }
1282 }
1283 #endif
1284
1285 dst += UNROLL;
1286 // we don't need to increment src as the asm above has already done it
1287 count -= UNROLL;
1288 x += UNROLL; // probably superfluous
1289 }
1290 }
1291 #undef UNROLL
1292
1293 // residuals
1294 if (count > 0) {
1295 DITHER_565_SCAN(y);
1296 do {
1297 SkPMColor c = *src++;
1298 SkPMColorAssert(c);
1299 SkASSERT(SkGetPackedA32(c) == 255);
1300
1301 unsigned dither = DITHER_VALUE(x);
1302 *dst++ = SkDitherRGB32To565(c, dither);
1303 DITHER_INC_X(x);
1304 } while (--count != 0);
1305 }
1306 }
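/* Illustrative sketch (not compiled) of the per-channel dithering above: each
 * 8-bit channel is adjusted as c - (c >> 5) + d before the top 5 bits are kept
 * (green uses c - (c >> 6) + (d >> 1) and keeps 6 bits). S32A_D565_Opaque_Dither_neon
 * above applies the same adjustment before its blend instead of truncating
 * directly. The helper names are hypothetical.
 */
#if 0
static inline unsigned dither_to_5bits_sketch(unsigned c8, unsigned d /*0..7*/) {
    return (c8 - (c8 >> 5) + d) >> 3;          // red and blue path
}
static inline unsigned dither_to_6bits_sketch(unsigned c8, unsigned d /*0..7*/) {
    return (c8 - (c8 >> 6) + (d >> 1)) >> 2;   // green path
}
#endif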
1307
1308 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1309 SkPMColor color) {
1310 if (count <= 0) {
1311 return;
1312 }
1313
1314 if (0 == color) {
1315 if (src != dst) {
1316 memcpy(dst, src, count * sizeof(SkPMColor));
1317 }
1318 return;
1319 }
1320
1321 unsigned colorA = SkGetPackedA32(color);
1322 if (255 == colorA) {
1323 sk_memset32(dst, color, count);
1324 } else {
1325 unsigned scale = 256 - SkAlpha255To256(colorA);
1326
1327 if (count >= 8) {
1328 // at the end of this assembly, count will have been decremented
1329 // to a negative value. That is, if count mod 8 = x, it will be
1330 // -8 +x coming out.
1331 asm volatile (
1332 PLD128(src, 0)
1333
1334 "vdup.32 q0, %[color] \n\t"
1335
1336 PLD128(src, 128)
1337
1338 // scale numerical interval [0-255], so load as 8 bits
1339 "vdup.8 d2, %[scale] \n\t"
1340
1341 PLD128(src, 256)
1342
1343 "subs %[count], %[count], #8 \n\t"
1344
1345 PLD128(src, 384)
1346
1347 "Loop_Color32: \n\t"
1348
1349 // load src color, 8 pixels, 4 64 bit registers
1350 // (and increment src).
1351 "vld1.32 {d4-d7}, [%[src]]! \n\t"
1352
1353 PLD128(src, 384)
1354
1355 // multiply long by scale, 64 bits at a time,
1356 // destination into a 128 bit register.
1357 "vmull.u8 q4, d4, d2 \n\t"
1358 "vmull.u8 q5, d5, d2 \n\t"
1359 "vmull.u8 q6, d6, d2 \n\t"
1360 "vmull.u8 q7, d7, d2 \n\t"
1361
1362 // shift the 128 bit registers, containing the 16
1363 // bit scaled values back to 8 bits, narrowing the
1364 // results to 64 bit registers.
1365 "vshrn.i16 d8, q4, #8 \n\t"
1366 "vshrn.i16 d9, q5, #8 \n\t"
1367 "vshrn.i16 d10, q6, #8 \n\t"
1368 "vshrn.i16 d11, q7, #8 \n\t"
1369
1370 // adding back the color, using 128 bit registers.
1371 "vadd.i8 q6, q4, q0 \n\t"
1372 "vadd.i8 q7, q5, q0 \n\t"
1373
1374 // store back the 8 calculated pixels (2 128 bit
1375 // registers), and increment dst.
1376 "vst1.32 {d12-d15}, [%[dst]]! \n\t"
1377
1378 "subs %[count], %[count], #8 \n\t"
1379 "bge Loop_Color32 \n\t"
1380 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1381 : [color] "r" (color), [scale] "r" (scale)
1382 : "cc", "memory",
1383 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1384 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1385 );
1386 // At this point, if we went through the inline assembly, count is
1387 // a negative value:
1388 // if the value is -8, there is no pixel left to process.
1389 // if the value is -7, there is one pixel left to process
1390 // ...
1391 // And'ing it with 7 will give us the number of pixels
1392 // left to process.
1393 count = count & 0x7;
1394 }
1395
1396 while (count > 0) {
1397 *dst = color + SkAlphaMulQ(*src, scale);
1398 src += 1;
1399 dst += 1;
1400 count--;
1401 }
1402 }
1403 }
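/* Illustrative sketch (not compiled) of the per-byte arithmetic in the
 * Loop_Color32 assembly above: every source byte is scaled by
 * scale = 256 - SkAlpha255To256(colorA), shifted down by 8, and the matching
 * byte of 'color' is added (vadd.i8), mirroring the scalar tail
 * *dst = color + SkAlphaMulQ(*src, scale). The helper name is hypothetical.
 */
#if 0
static inline SkPMColor color32_pixel_sketch(SkPMColor src, SkPMColor color,
                                             unsigned scale) {
    SkPMColor res = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
        unsigned s = (src >> shift) & 0xFF;
        unsigned c = (color >> shift) & 0xFF;
        res |= ((c + ((s * scale) >> 8)) & 0xFF) << shift;
    }
    return res;
}
#endif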
1404
1405 ///////////////////////////////////////////////////////////////////////////////
1406
1407 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1408 // no dither
1409 // NOTE: For the S32_D565_Blend function below, we don't have a special
1410 // version that assumes that each source pixel is opaque. But our
1411 // S32A is still faster than the default, so use it.
1412 S32_D565_Opaque_neon,
1413 S32A_D565_Blend_neon, // really S32_D565_Blend
1414 S32A_D565_Opaque_neon,
1415 S32A_D565_Blend_neon,
1416
1417 // dither
1418 S32_D565_Opaque_Dither_neon,
1419 S32_D565_Blend_Dither_neon,
1420 S32A_D565_Opaque_Dither_neon,
1421 NULL, // S32A_D565_Blend_Dither
1422 };
1423
1424 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1425 NULL, // S32_Opaque,
1426 S32_Blend_BlitRow32_neon, // S32_Blend,
1427 /*
1428 * We have two choices for S32A_Opaque procs. The first one reads the src
1429 * alpha value and attempts to optimize accordingly. The optimization is
1430 * sensitive to the source content and is not a win in all cases. For
1431 * example, if there are a lot of transitions between the alpha states,
1432 * the performance will almost certainly be worse. However, for many
1433 * common cases the performance is equivalent or better than the standard
1434 * case where we do not inspect the src alpha.
1435 */
1436 #if SK_A32_SHIFT == 24
1437 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1439 #else
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1441 #endif
1442 S32A_Blend_BlitRow32_neon // S32A_Blend
1443 };
1444