1 /*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 ** http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17
18 #ifdef ANDROID
19 #include <machine/cpu-features.h>
20 #endif
21
22 #include "SkBlitRow.h"
23 #include "SkColorPriv.h"
24 #include "SkDither.h"
25
26 #if defined(__ARM_HAVE_NEON)
27 #include <arm_neon.h>
28 #endif
29
30 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
31 static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
32 const SkPMColor* SK_RESTRICT src, int count,
33 U8CPU alpha, int /*x*/, int /*y*/) {
34 SkASSERT(255 == alpha);
35
36 if (count >= 8) {
37 uint16_t* SK_RESTRICT keep_dst;
38
39 asm volatile (
40 "ands ip, %[count], #7 \n\t"
41 "vmov.u8 d31, #1<<7 \n\t"
42 "vld1.16 {q12}, [%[dst]] \n\t"
43 "vld4.8 {d0-d3}, [%[src]] \n\t"
44 "moveq ip, #8 \n\t"
45 "mov %[keep_dst], %[dst] \n\t"
46
47 "add %[src], %[src], ip, LSL#2 \n\t"
48 "add %[dst], %[dst], ip, LSL#1 \n\t"
49 "subs %[count], %[count], ip \n\t"
50 "b 9f \n\t"
51 // LOOP
52 "2: \n\t"
53
54 "vld1.16 {q12}, [%[dst]]! \n\t"
55 "vld4.8 {d0-d3}, [%[src]]! \n\t"
56 "vst1.16 {q10}, [%[keep_dst]] \n\t"
57 "sub %[keep_dst], %[dst], #8*2 \n\t"
58 "subs %[count], %[count], #8 \n\t"
59 "9: \n\t"
60 "pld [%[dst],#32] \n\t"
61 // expand 0565 q12 to 8888 {d4-d7}
62 "vmovn.u16 d4, q12 \n\t"
63 "vshr.u16 q11, q12, #5 \n\t"
64 "vshr.u16 q10, q12, #6+5 \n\t"
65 "vmovn.u16 d5, q11 \n\t"
66 "vmovn.u16 d6, q10 \n\t"
67 "vshl.u8 d4, d4, #3 \n\t"
68 "vshl.u8 d5, d5, #2 \n\t"
69 "vshl.u8 d6, d6, #3 \n\t"
70
71 "vmovl.u8 q14, d31 \n\t"
72 "vmovl.u8 q13, d31 \n\t"
73 "vmovl.u8 q12, d31 \n\t"
74
75 // duplicate in 4/2/1 & 8pix vsns
76 "vmvn.8 d30, d3 \n\t"
77 "vmlal.u8 q14, d30, d6 \n\t"
78 "vmlal.u8 q13, d30, d5 \n\t"
79 "vmlal.u8 q12, d30, d4 \n\t"
80 "vshr.u16 q8, q14, #5 \n\t"
81 "vshr.u16 q9, q13, #6 \n\t"
82 "vaddhn.u16 d6, q14, q8 \n\t"
83 "vshr.u16 q8, q12, #5 \n\t"
84 "vaddhn.u16 d5, q13, q9 \n\t"
85 "vqadd.u8 d6, d6, d0 \n\t" // moved up
86 "vaddhn.u16 d4, q12, q8 \n\t"
87 // intentionally don't calculate alpha
88 // result in d4-d6
89
90 "vqadd.u8 d5, d5, d1 \n\t"
91 "vqadd.u8 d4, d4, d2 \n\t"
92
93 // pack 8888 {d4-d6} to 0565 q10
94 "vshll.u8 q10, d6, #8 \n\t"
95 "vshll.u8 q3, d5, #8 \n\t"
96 "vshll.u8 q2, d4, #8 \n\t"
97 "vsri.u16 q10, q3, #5 \n\t"
98 "vsri.u16 q10, q2, #11 \n\t"
99
100 "bne 2b \n\t"
101
102 "1: \n\t"
103 "vst1.16 {q10}, [%[keep_dst]] \n\t"
104 : [count] "+r" (count)
105 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
106 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
107 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
108 "d30","d31"
109 );
110 }
111 else
112 { // handle count < 8
113 uint16_t* SK_RESTRICT keep_dst;
114
115 asm volatile (
116 "vmov.u8 d31, #1<<7 \n\t"
117 "mov %[keep_dst], %[dst] \n\t"
118
119 "tst %[count], #4 \n\t"
120 "beq 14f \n\t"
121 "vld1.16 {d25}, [%[dst]]! \n\t"
122 "vld1.32 {q1}, [%[src]]! \n\t"
123
124 "14: \n\t"
125 "tst %[count], #2 \n\t"
126 "beq 12f \n\t"
127 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
128 "vld1.32 {d1}, [%[src]]! \n\t"
129
130 "12: \n\t"
131 "tst %[count], #1 \n\t"
132 "beq 11f \n\t"
133 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
134 "vld1.32 {d0[1]}, [%[src]]! \n\t"
135
136 "11: \n\t"
137 // unzips achieve the same as a vld4 operation
138 "vuzpq.u16 q0, q1 \n\t"
139 "vuzp.u8 d0, d1 \n\t"
140 "vuzp.u8 d2, d3 \n\t"
141 // expand 0565 q12 to 8888 {d4-d7}
142 "vmovn.u16 d4, q12 \n\t"
143 "vshr.u16 q11, q12, #5 \n\t"
144 "vshr.u16 q10, q12, #6+5 \n\t"
145 "vmovn.u16 d5, q11 \n\t"
146 "vmovn.u16 d6, q10 \n\t"
147 "vshl.u8 d4, d4, #3 \n\t"
148 "vshl.u8 d5, d5, #2 \n\t"
149 "vshl.u8 d6, d6, #3 \n\t"
150
151 "vmovl.u8 q14, d31 \n\t"
152 "vmovl.u8 q13, d31 \n\t"
153 "vmovl.u8 q12, d31 \n\t"
154
155 // duplicate in 4/2/1 & 8pix vsns
156 "vmvn.8 d30, d3 \n\t"
157 "vmlal.u8 q14, d30, d6 \n\t"
158 "vmlal.u8 q13, d30, d5 \n\t"
159 "vmlal.u8 q12, d30, d4 \n\t"
160 "vshr.u16 q8, q14, #5 \n\t"
161 "vshr.u16 q9, q13, #6 \n\t"
162 "vaddhn.u16 d6, q14, q8 \n\t"
163 "vshr.u16 q8, q12, #5 \n\t"
164 "vaddhn.u16 d5, q13, q9 \n\t"
165 "vqadd.u8 d6, d6, d0 \n\t" // moved up
166 "vaddhn.u16 d4, q12, q8 \n\t"
167 // intentionally don't calculate alpha
168 // result in d4-d6
169
170 "vqadd.u8 d5, d5, d1 \n\t"
171 "vqadd.u8 d4, d4, d2 \n\t"
172
173 // pack 8888 {d4-d6} to 0565 q10
174 "vshll.u8 q10, d6, #8 \n\t"
175 "vshll.u8 q3, d5, #8 \n\t"
176 "vshll.u8 q2, d4, #8 \n\t"
177 "vsri.u16 q10, q3, #5 \n\t"
178 "vsri.u16 q10, q2, #11 \n\t"
179
180 // store
181 "tst %[count], #4 \n\t"
182 "beq 24f \n\t"
183 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
184
185 "24: \n\t"
186 "tst %[count], #2 \n\t"
187 "beq 22f \n\t"
188 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
189
190 "22: \n\t"
191 "tst %[count], #1 \n\t"
192 "beq 21f \n\t"
193 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
194
195 "21: \n\t"
196 : [count] "+r" (count)
197 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
198 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
199 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
200 "d30","d31"
201 );
202 }
203 }
204
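/* For reference: a rough scalar sketch (kept out of the build) of the per-pixel
 * math the NEON code above performs: srcover of a premultiplied 8888 source onto
 * a 565 destination. The asm approximates the divide by 255 with an add-high-half
 * trick and uses saturating adds, so its results can differ slightly from this
 * sketch in the least significant bit. The helper name is illustrative only.
 */
#if 0
static inline uint16_t S32A_D565_Opaque_1px_sketch(SkPMColor c, uint16_t d) {
    unsigned invScale = SkAlpha255To256(255 - SkGetPackedA32(c));   // 0..256
    // expand each dst 565 channel to 8 bits and scale it by (255 - srcA)
    unsigned dr = SkAlphaMul(SkGetPackedR16(d) << 3, invScale);
    unsigned dg = SkAlphaMul(SkGetPackedG16(d) << 2, invScale);
    unsigned db = SkAlphaMul(SkGetPackedB16(d) << 3, invScale);
    // add the premultiplied source channels, then repack to 565
    // (the NEON code uses vqadd here to guard against rounding overflow)
    unsigned r = SkGetPackedR32(c) + dr;
    unsigned g = SkGetPackedG32(c) + dg;
    unsigned b = SkGetPackedB32(c) + db;
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif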
205 static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
206 const SkPMColor* SK_RESTRICT src, int count,
207 U8CPU alpha, int /*x*/, int /*y*/) {
208
209 U8CPU alpha_for_asm = alpha;
210
211 asm volatile (
212 /* This code implements a Neon version of S32A_D565_Blend. The output differs from
213 * the original in two respects:
214 * 1. The results have a few mismatches compared to the original code. These mismatches
215 * never exceed 1. It's possible to improve accuracy vs. a floating point
216 * implementation by introducing rounding right shifts (vrshr) for the final stage.
217 * Rounding is not present in the code below, because although results would be closer
218 * to a floating point implementation, the number of mismatches compared to the
219 * original code would be far greater.
220 * 2. On certain inputs, the original code can overflow, causing colour channels to
221 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel
222 * to affect another.
223 */
224
225 #if 1
226 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
227 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256
228 #else
229 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256
230 #endif
231 "vmov.u16 q3, #255 \n\t" // set up constant
232 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3
233 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon
234 "beq 2f \n\t" // if count8 == 0, exit
235 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
236
237 "1: \n\t"
238 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels
239 "subs r4, r4, #1 \n\t" // decrement loop counter
240 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels
241 // and deinterleave
242
243 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes
244 "vand q10, q0, q15 \n\t" // extract blue
245 "vshr.u16 q8, q0, #11 \n\t" // extract red
246 "vshr.u16 q9, q9, #10 \n\t" // extract green
247 // dstrgb = {q8, q9, q10}
248
249 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range
250 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range
251 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range
252
253 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits
254 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits
255 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits
256 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits
257 // srcrgba = {q11, q12, q13, q14}
258
259 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale
260 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale
261 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale
262 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale
263
264 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8
265 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8)
266 // dst_scale = q2
267
268 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale
269 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale
270 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale
271
272 #if 1
273 // trying for a better match with SkDiv255Round(a)
274 // C alg is: a+=128; (a+a>>8)>>8
275 // we'll use just a rounding shift [q2 is available for scratch]
276 "vrshr.u16 q11, q11, #8 \n\t" // shift down red
277 "vrshr.u16 q12, q12, #8 \n\t" // shift down green
278 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
279 #else
280 // arm's original "truncating divide by 256"
281 "vshr.u16 q11, q11, #8 \n\t" // shift down red
282 "vshr.u16 q12, q12, #8 \n\t" // shift down green
283 "vshr.u16 q13, q13, #8 \n\t" // shift down blue
284 #endif
285
286 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
287 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
288 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr
289
290 "bne 1b \n\t" // if counter != 0, loop
291 "2: \n\t" // exit
292
293 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
294 :
295 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
296 );
297
298 count &= 7;
299 if (count > 0) {
300 do {
301 SkPMColor sc = *src++;
302 if (sc) {
303 uint16_t dc = *dst;
304 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
305 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
306 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
307 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
308 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
309 }
310 dst += 1;
311 } while (--count != 0);
312 }
313 }
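/* Worked example of the scale set-up above: with a blend alpha of 255 and a fully
 * opaque source pixel (sa = 255), src_scale = 255 + 1 = 256, so
 * (sa * src_scale) >> 8 = 255 and dst_scale = 255 - 255 = 0; the destination then
 * contributes nothing and the source replaces it, matching the scalar residual
 * loop's dst_scale = 255 - SkMulDiv255Round(sa, alpha).
 */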
314
315 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
316 * each dither value is spaced out into byte lanes, and repeated
317 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
318 * start of each row.
319 */
320 static const uint8_t gDitherMatrix_Neon[48] = {
321 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
322 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
323 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
324 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
325
326 };
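/* Example of the row/offset selection used below: for y == 2 and x == 1 the base
 * index is (2&3)*12 + (1&3) = 25, so the 8-byte load picks up {5,0,4,1,5,0,4,1}
 * from the third row, i.e. the same per-pixel sequence the scalar DITHER_VALUE(x)
 * path produces for eight consecutive pixels starting at that x.
 */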
327
328 static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
329 int count, U8CPU alpha, int x, int y)
330 {
331 /* select row and offset for dither array */
332 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
333
334 /* rescale alpha to range 0 - 256 */
335 int scale = SkAlpha255To256(alpha);
336
337 asm volatile (
338 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values
339 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values
340 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg
341 "vmov.i8 d29, #0x3f \n\t" // set up green mask
342 "vmov.i8 d28, #0x1f \n\t" // set up blue mask
343 "1: \n\t"
344 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb
345 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5
346 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6
347 "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5
348 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen
349 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen
350 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen
351 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result
352 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result
353 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result
354 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits
355 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits
356 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits
357 // load 8 pixels from dst, extract rgb
358 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels
359 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits
360 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes
361 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red
362 "vand d17, d17, d29 \n\t" // and green with green mask
363 "vand d18, d18, d28 \n\t" // and blue with blue mask
364 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes
365 // src = {d22 (r), d23 (g), d24 (b)}
366 // dst = {d16 (r), d17 (g), d18 (b)}
367 // subtract dst from src and widen
368 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst
369 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst
370 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst
371 // multiply diffs by scale and shift
372 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale
373 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale
374 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale
375 "subs %[count], %[count], #8 \n\t" // decrement loop counter
376 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow
377 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow
378 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow
379 // add dst to result
380 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red
381 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green
382 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue
383 // put result into 565 format
384 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue
385 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue
386 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result
387 "bgt 1b \n\t" // loop if count > 0
388 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
389 : [dstart] "r" (dstart), [scale] "r" (scale)
390 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
391 );
392
393 DITHER_565_SCAN(y);
394
395 while((count & 7) > 0)
396 {
397 SkPMColor c = *src++;
398
399 int dither = DITHER_VALUE(x);
400 int sr = SkGetPackedR32(c);
401 int sg = SkGetPackedG32(c);
402 int sb = SkGetPackedB32(c);
403 sr = SkDITHER_R32To565(sr, dither);
404 sg = SkDITHER_G32To565(sg, dither);
405 sb = SkDITHER_B32To565(sb, dither);
406
407 uint16_t d = *dst;
408 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
409 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
410 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
411 DITHER_INC_X(x);
412 count--;
413 }
414 }
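/* The NEON flow above (vsubl/vmul/vshrn/vaddl) mirrors the scalar SkAlphaBlend()
 * used in the residual loop: per channel, roughly dst + (((src - dst) * scale) >> 8),
 * computed on the dithered, range-reduced 565 channel values, with
 * scale = SkAlpha255To256(alpha).
 */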
415
416 #define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
417 #define S32A_D565_Blend_PROC S32A_D565_Blend_neon
418 #define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
419 #else
420 #define S32A_D565_Opaque_PROC NULL
421 #define S32A_D565_Blend_PROC NULL
422 #define S32_D565_Blend_Dither_PROC NULL
423 #endif
424
425 /* We don't have a special version that assumes each src is opaque, but our S32A
426 version is still faster than the default, so use it here.
427 */
428 #define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC
429 #define S32_D565_Blend_PROC S32A_D565_Blend_PROC
430
431 ///////////////////////////////////////////////////////////////////////////////
432
433 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
434
435 static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
436 const SkPMColor* SK_RESTRICT src,
437 int count, U8CPU alpha) {
438
439 SkASSERT(255 == alpha);
440 if (count > 0) {
441
442
443 uint8x8_t alpha_mask;
444
445 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
446 alpha_mask = vld1_u8(alpha_mask_setup);
447
448 /* do the NEON unrolled code */
449 #define UNROLL 4
450 while (count >= UNROLL) {
451 uint8x8_t src_raw, dst_raw, dst_final;
452 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
453
454 /* get the source */
455 src_raw = vreinterpret_u8_u32(vld1_u32(src));
456 #if UNROLL > 2
457 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
458 #endif
459
460 /* get and hold the dst too */
461 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
462 #if UNROLL > 2
463 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
464 #endif
465
466 /* 1st and 2nd bits of the unrolling */
467 {
468 uint8x8_t dst_cooked;
469 uint16x8_t dst_wide;
470 uint8x8_t alpha_narrow;
471 uint16x8_t alpha_wide;
472
473 /* get the alphas spread out properly */
474 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
475 #if 1
476 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
477 /* we collapsed (255-a)+1 ... */
478 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
479 #else
480 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
481 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
482 #endif
483
484 /* spread the dest */
485 dst_wide = vmovl_u8(dst_raw);
486
487 /* alpha mul the dest */
488 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
489 dst_cooked = vshrn_n_u16(dst_wide, 8);
490
491 /* sum -- ignoring any byte lane overflows */
492 dst_final = vadd_u8(src_raw, dst_cooked);
493 }
494
495 #if UNROLL > 2
496 /* the 3rd and 4th bits of our unrolling */
497 {
498 uint8x8_t dst_cooked;
499 uint16x8_t dst_wide;
500 uint8x8_t alpha_narrow;
501 uint16x8_t alpha_wide;
502
503 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
504 #if 1
505 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
506 /* we collapsed (255-a)+1 ... */
507 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
508 #else
509 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
510 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
511 #endif
512
513 /* spread the dest */
514 dst_wide = vmovl_u8(dst_raw_2);
515
516 /* alpha mul the dest */
517 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
518 dst_cooked = vshrn_n_u16(dst_wide, 8);
519
520 /* sum -- ignoring any byte lane overflows */
521 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
522 }
523 #endif
524
525 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
526 #if UNROLL > 2
527 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
528 #endif
529
530 src += UNROLL;
531 dst += UNROLL;
532 count -= UNROLL;
533 }
534 #undef UNROLL
535
536 /* do any residual iterations */
537 while (--count >= 0) {
538 #ifdef TEST_SRC_ALPHA
539 SkPMColor sc = *src;
540 if (sc) {
541 unsigned srcA = SkGetPackedA32(sc);
542 SkPMColor result = sc;
543 if (srcA != 255) {
544 result = SkPMSrcOver(sc, *dst);
545 }
546 *dst = result;
547 }
548 #else
549 *dst = SkPMSrcOver(*src, *dst);
550 #endif
551 src += 1;
552 dst += 1;
553 }
554 }
555 }
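/* Note: each unrolled NEON chunk above computes dst_byte * (256 - srcA) >> 8 and
 * then adds the source byte, which is essentially the per-channel math of the
 * scalar SkPMSrcOver() used for the residual pixels.
 */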
556
557 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
558
559 #else
560
561 #ifdef TEST_SRC_ALPHA
562 #error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
563 #endif
564
565 static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
566 const SkPMColor* SK_RESTRICT src,
567 int count, U8CPU alpha) {
568
569 SkASSERT(255 == alpha);
570
571 /* Does not support the TEST_SRC_ALPHA case */
572 asm volatile (
573 "cmp %[count], #0 \n\t" /* comparing count with 0 */
574 "beq 3f \n\t" /* if zero exit */
575
576 "mov ip, #0xff \n\t" /* load the 0xff mask in ip */
577 "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */
578
579 "cmp %[count], #2 \n\t" /* compare count with 2 */
580 "blt 2f \n\t" /* if less than 2 -> single loop */
581
582 /* Double Loop */
583 "1: \n\t" /* <double loop> */
584 "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */
585 "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */
586 "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
587
588 /* ----------- */
589 "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
590 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
591 "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
592
593 "mul r9, r9, r4 \n\t" /* br = br * scale */
594 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
595 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
596
597 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
598 "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
599 "orr r7, r9, r10 \n\t" /* br | ag*/
600
601 "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */
602 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */
603
604 /* ----------- */
605 "and r9, ip, r8 \n\t" /* r9 = br masked by ip */
606
607 "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */
608 "mul r9, r9, r4 \n\t" /* br = br * scale */
609 "sub %[count], %[count], #2 \n\t"
610 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
611
612 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
613 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
614 "cmp %[count], #1 \n\t" /* comparing count with 1 */
615 "orr r8, r9, r10 \n\t" /* br | ag */
616
617 "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */
618
619 /* ----------------- */
620 "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */
621 /* ----------------- */
622
623 "bgt 1b \n\t" /* if greater than 1 -> reloop */
624 "blt 3f \n\t" /* if less than 1 -> exit */
625
626 /* Single Loop */
627 "2: \n\t" /* <single loop> */
628 "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */
629 "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */
630 "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
631
632 /* ----------- */
633 "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
634 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
635
636 "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
637 "mul r9, r9, r4 \n\t" /* br = br * scale */
638 "mul r10, r10, r4 \n\t" /* ag = ag * scale */
639 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
640
641 "and r10, r10, ip, lsl #8 \n\t" /* mask ag */
642 "orr r7, r9, r10 \n\t" /* br | ag */
643
644 "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */
645
646 /* ----------------- */
647 "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */
648 /* ----------------- */
649
650 "3: \n\t" /* <exit> */
651 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
652 :
653 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
654 );
655 }
656 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm
657 #endif
658
659 /*
660 * ARM asm version of S32A_Blend_BlitRow32
661 */
662 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
663 const SkPMColor* SK_RESTRICT src,
664 int count, U8CPU alpha) {
665 asm volatile (
666 "cmp %[count], #0 \n\t" /* comparing count with 0 */
667 "beq 3f \n\t" /* if zero exit */
668
669 "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */
670 "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */
671
672 /* src1,2_scale */
673 "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */
674
675 "cmp %[count], #2 \n\t" /* comparing count with 2 */
676 "blt 2f \n\t" /* if less than 2 -> single loop */
677
678 /* Double Loop */
679 "1: \n\t" /* <double loop> */
680 "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */
681 "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */
682
683 /* dst1_scale and dst2_scale*/
684 "lsr r9, r5, #24 \n\t" /* src >> 24 */
685 "lsr r10, r6, #24 \n\t" /* src >> 24 */
686 "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */
687 "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */
688 "lsr r9, r9, #8 \n\t" /* r9 >> 8 */
689 "lsr r10, r10, #8 \n\t" /* r10 >> 8 */
690 "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
691 "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
692
693 /* ---------------------- */
694
695 /* src1, src1_scale */
696 "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
697 "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */
698 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
699 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
700 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
701 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
702 "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */
703
704 /* dst1, dst1_scale */
705 "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
706 "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */
707 "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */
708 "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */
709 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
710 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
711 "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */
712
713 /* ---------------------- */
714 "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */
715 /* ---------------------- */
716
717 /* ====================== */
718
719 /* src2, src2_scale */
720 "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
721 "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */
722 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
723 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
724 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
725 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
726 "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */
727
728 /* dst2, dst2_scale */
729 "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
730 "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */
731 "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */
732 "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */
733 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
734 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
735 "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */
736
737 "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */
738 /* ---------------------- */
739 "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */
740 /* ---------------------- */
741 "cmp %[count], #1 \n\t" /* compare count with 1 */
742 /* ----------------- */
743 "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */
744 /* ----------------- */
745
746 "bgt 1b \n\t" /* if %[count] greater than 1 reloop */
747 "blt 3f \n\t" /* if %[count] less than 1 exit */
748 /* else get into the single loop */
749 /* Single Loop */
750 "2: \n\t" /* <single loop> */
751 "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */
752 "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */
753
754 "lsr r6, r5, #24 \n\t" /* src >> 24 */
755 "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
756 "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */
757 "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */
758 "lsr r6, r6, #8 \n\t" /* r6 >> 8 */
759 "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */
760 "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */
761
762 /* src, src_scale */
763 "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */
764 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
765 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
766 "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */
767
768 /* dst, dst_scale */
769 "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
770 "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */
771 "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */
772 "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */
773 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
774 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
775 "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */
776
777 "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */
778
779 /* ----------------- */
780 "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */
781 /* ----------------- */
782
783 "3: \n\t" /* <exit> */
784 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
785 :
786 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
787 );
788
789 }
790 #define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
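/* A scalar sketch (kept out of the build) of what one pixel of the asm above
 * computes; the helper name is illustrative. src_scale is alpha + 1 (range 0..256)
 * and dst_scale = 256 - ((srcA * src_scale) >> 8), matching the register comments.
 */
#if 0
static inline SkPMColor S32A_Blend_1px_sketch(SkPMColor s, SkPMColor d,
                                              unsigned src_scale) {
    unsigned dst_scale = 256 - ((SkGetPackedA32(s) * src_scale) >> 8);
    // SkAlphaMulQ scales the rb and ag halves separately using the 0xff00ff mask,
    // the same masking trick the asm performs in r4/r11
    return SkAlphaMulQ(s, src_scale) + SkAlphaMulQ(d, dst_scale);
}
#endif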
791
792 /* Neon version of S32_Blend_BlitRow32()
793 * portable version is in src/core/SkBlitRow_D32.cpp
794 */
795 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
796 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
797 const SkPMColor* SK_RESTRICT src,
798 int count, U8CPU alpha) {
799 SkASSERT(alpha <= 255);
800 if (count > 0) {
801 uint16_t src_scale = SkAlpha255To256(alpha);
802 uint16_t dst_scale = 256 - src_scale;
803
804 /* run them N at a time through the NEON unit */
805 /* note that each pixel is 4 bytes, each byte treated exactly the same,
806 * so we can work under that guise. We *do* know that the src&dst
807 * will be 32-bit aligned quantities, so we can specify that on
808 * the load/store ops and do a neon 'reinterpret' to get us to
809 * byte-sized (pun intended) pieces that we widen/multiply/shift.
810 * We're limited to 128 bits in the wide ops, which is 8x16 bits
811 * or a pair of 32-bit src/dsts.
812 */
813 /* we *could* manually unroll this loop so that we load 128 bits
814 * (as a pair of 64s) from each of src and dst, processing them
815 * in pieces. This might give us a little better management of
816 * the memory latency, but my initial attempts here did not
817 * produce an instruction stream that looked all that nice.
818 */
819 #define UNROLL 2
820 while (count >= UNROLL) {
821 uint8x8_t src_raw, dst_raw, dst_final;
822 uint16x8_t src_wide, dst_wide;
823
824 /* get 64 bits of src, widen it, multiply by src_scale */
825 src_raw = vreinterpret_u8_u32(vld1_u32(src));
826 src_wide = vmovl_u8(src_raw);
827 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
828 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
829
830 /* ditto with dst */
831 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
832 dst_wide = vmovl_u8(dst_raw);
833
834 /* combine add with dst multiply into mul-accumulate */
835 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
836
837 dst_final = vshrn_n_u16(dst_wide, 8);
838 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
839
840 src += UNROLL;
841 dst += UNROLL;
842 count -= UNROLL;
843 }
844 /* RBE: well, i don't like how gcc manages src/dst across the above
845 * loop: it's constantly calculating src+bias, dst+bias and it only
846 * adjusts the real ones when we leave the loop. Not sure why
847 * it's "hoisting down" (hoisting implies above in my lexicon ;))
848 * the adjustments to src/dst/count, but it does...
849 * (might be SSA-style internal logic...)
850 */
851
852 #if UNROLL == 2
853 if (count == 1) {
854 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
855 }
856 #else
857 if (count > 0) {
858 do {
859 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
860 src += 1;
861 dst += 1;
862 } while (--count > 0);
863 }
864 #endif
865
866 #undef UNROLL
867 }
868 }
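/* Since src_scale + dst_scale == 256 here, the >>8 in the NEON lanes keeps results
 * in range: e.g. for alpha = 0x80, src_scale = 129 and dst_scale = 127, so a white
 * src over a white dst gives (255*129 + 255*127) >> 8 = 255 per channel.
 */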
869
870 #define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon
871 #else
872 #define S32_Blend_BlitRow32_PROC NULL
873 #endif
874
875 ///////////////////////////////////////////////////////////////////////////////
876
877 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
878
879 #undef DEBUG_OPAQUE_DITHER
880
881 #if defined(DEBUG_OPAQUE_DITHER)
882 static void showme8(char *str, void *p, int len)
883 {
884 static char buf[256];
885 char tbuf[32];
886 int i;
887 char *pc = (char*) p;
888 sprintf(buf,"%8s:", str);
889 for(i=0;i<len;i++) {
890 sprintf(tbuf, " %02x", pc[i]);
891 strcat(buf, tbuf);
892 }
893 SkDebugf("%s\n", buf);
894 }
895 static void showme16(char *str, void *p, int len)
896 {
897 static char buf[256];
898 char tbuf[32];
899 int i;
900 uint16_t *pc = (uint16_t*) p;
901 sprintf(buf,"%8s:", str);
902 len = (len / sizeof(uint16_t)); /* passed as bytes */
903 for(i=0;i<len;i++) {
904 sprintf(tbuf, " %04x", pc[i]);
905 strcat(buf, tbuf);
906 }
907 SkDebugf("%s\n", buf);
908 }
909 #endif
910
911 static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
912 const SkPMColor* SK_RESTRICT src,
913 int count, U8CPU alpha, int x, int y) {
914 SkASSERT(255 == alpha);
915
916 #define UNROLL 8
917
918 if (count >= UNROLL) {
919 uint8x8_t dbase;
920
921 #if defined(DEBUG_OPAQUE_DITHER)
922 uint16_t tmpbuf[UNROLL];
923 int td[UNROLL];
924 int tdv[UNROLL];
925 int ta[UNROLL];
926 int tap[UNROLL];
927 uint16_t in_dst[UNROLL];
928 int offset = 0;
929 int noisy = 0;
930 #endif
931
932 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
933 dbase = vld1_u8(dstart);
934
935 do {
936 uint8x8_t sr, sg, sb, sa, d;
937 uint16x8_t dst8, scale8, alpha8;
938 uint16x8_t dst_r, dst_g, dst_b;
939
940 #if defined(DEBUG_OPAQUE_DITHER)
941 /* calculate 8 elements worth into a temp buffer */
942 {
943 int my_y = y;
944 int my_x = x;
945 SkPMColor* my_src = (SkPMColor*)src;
946 uint16_t* my_dst = dst;
947 int i;
948
949 DITHER_565_SCAN(my_y);
950 for(i=0;i<UNROLL;i++) {
951 SkPMColor c = *my_src++;
952 SkPMColorAssert(c);
953 if (c) {
954 unsigned a = SkGetPackedA32(c);
955
956 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
957 tdv[i] = DITHER_VALUE(my_x);
958 ta[i] = a;
959 tap[i] = SkAlpha255To256(a);
960 td[i] = d;
961
962 unsigned sr = SkGetPackedR32(c);
963 unsigned sg = SkGetPackedG32(c);
964 unsigned sb = SkGetPackedB32(c);
965 sr = SkDITHER_R32_FOR_565(sr, d);
966 sg = SkDITHER_G32_FOR_565(sg, d);
967 sb = SkDITHER_B32_FOR_565(sb, d);
968
969 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
970 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
971 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
972 // now src and dst expanded are in g:11 r:10 x:1 b:10
973 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
974 td[i] = d;
975
976 } else {
977 tmpbuf[i] = *my_dst;
978 ta[i] = tdv[i] = td[i] = 0xbeef;
979 }
980 in_dst[i] = *my_dst;
981 my_dst += 1;
982 DITHER_INC_X(my_x);
983 }
984 }
985 #endif
986
987 /* source is in ABGR */
988 {
989 register uint8x8_t d0 asm("d0");
990 register uint8x8_t d1 asm("d1");
991 register uint8x8_t d2 asm("d2");
992 register uint8x8_t d3 asm("d3");
993
994 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
995 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
996 : "r" (src)
997 );
998 sr = d0; sg = d1; sb = d2; sa = d3;
999 }
1000
1001 /* calculate 'd', which will be 0..7 */
1002 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1003 #if ANDROID
1004 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1005 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1006 #else
1007 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1008 #endif
1009 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1010 d = vshrn_n_u16(alpha8, 8); /* narrowing too */
1011
1012 /* sr = sr - (sr>>5) + d */
1013 /* watching for 8-bit overflow. d is 0..7; risky range of
1014 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1015 * safe as long as we do ((sr-sr>>5) + d) */
1016 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1017 sr = vadd_u8(sr, d);
1018
1019 /* sb = sb - (sb>>5) + d */
1020 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1021 sb = vadd_u8(sb, d);
1022
1023 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1024 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1025 sg = vadd_u8(sg, vshr_n_u8(d,1));
1026
1027 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1028 dst8 = vld1q_u16(dst);
1029 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1030 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1031 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
1032
1033 /* blend */
1034 #if 1
1035 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1036 /* originally 255-sa + 1 */
1037 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1038 #else
1039 scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1040 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1041 #endif
1042
1043 #if 1
1044 /* combine the addq and mul, save 3 insns */
1045 scale8 = vshrq_n_u16(scale8, 3);
1046 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1047 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1048 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1049 #else
1050 /* known correct, but +3 insns over above */
1051 scale8 = vshrq_n_u16(scale8, 3);
1052 dst_b = vmulq_u16(dst_b, scale8);
1053 dst_g = vmulq_u16(dst_g, scale8);
1054 dst_r = vmulq_u16(dst_r, scale8);
1055
1056 /* combine */
1057 /* NB: vshll widens, need to preserve those bits */
1058 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1059 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1060 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1061 #endif
1062
1063 /* repack to store */
1064 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1065 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1066 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1067
1068 vst1q_u16(dst, dst8);
1069
1070 #if defined(DEBUG_OPAQUE_DITHER)
1071 /* verify my 8 elements match the temp buffer */
1072 {
1073 int i, bad=0;
1074 static int invocation;
1075
1076 for (i=0;i<UNROLL;i++)
1077 if (tmpbuf[i] != dst[i]) bad=1;
1078 if (bad) {
1079 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1080 invocation, offset);
1081 SkDebugf(" alpha 0x%x\n", alpha);
1082 for (i=0;i<UNROLL;i++)
1083 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1084 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1085 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1086
1087 showme16("alpha8", &alpha8, sizeof(alpha8));
1088 showme16("scale8", &scale8, sizeof(scale8));
1089 showme8("d", &d, sizeof(d));
1090 showme16("dst8", &dst8, sizeof(dst8));
1091 showme16("dst_b", &dst_b, sizeof(dst_b));
1092 showme16("dst_g", &dst_g, sizeof(dst_g));
1093 showme16("dst_r", &dst_r, sizeof(dst_r));
1094 showme8("sb", &sb, sizeof(sb));
1095 showme8("sg", &sg, sizeof(sg));
1096 showme8("sr", &sr, sizeof(sr));
1097
1098 /* cop out */
1099 return;
1100 }
1101 offset += UNROLL;
1102 invocation++;
1103 }
1104 #endif
1105
1106 dst += UNROLL;
1107 src += UNROLL;
1108 count -= UNROLL;
1109 /* skip x += UNROLL, since it's unchanged mod-4 */
1110 } while (count >= UNROLL);
1111 }
1112 #undef UNROLL
1113
1114 /* residuals */
1115 if (count > 0) {
1116 DITHER_565_SCAN(y);
1117 do {
1118 SkPMColor c = *src++;
1119 SkPMColorAssert(c);
1120 if (c) {
1121 unsigned a = SkGetPackedA32(c);
1122
1123 // dither and alpha are just temporary variables to work-around
1124 // an ICE in debug.
1125 unsigned dither = DITHER_VALUE(x);
1126 unsigned alpha = SkAlpha255To256(a);
1127 int d = SkAlphaMul(dither, alpha);
1128
1129 unsigned sr = SkGetPackedR32(c);
1130 unsigned sg = SkGetPackedG32(c);
1131 unsigned sb = SkGetPackedB32(c);
1132 sr = SkDITHER_R32_FOR_565(sr, d);
1133 sg = SkDITHER_G32_FOR_565(sg, d);
1134 sb = SkDITHER_B32_FOR_565(sb, d);
1135
1136 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1137 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1138 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1139 // now src and dst expanded are in g:11 r:10 x:1 b:10
1140 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1141 }
1142 dst += 1;
1143 DITHER_INC_X(x);
1144 } while (--count != 0);
1145 }
1146 }
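/* Note on the "expanded" format used in the residual loop above: SkExpand_rgb_16()
 * spreads the three 565 fields of a pixel across a 32-bit word (green in the high
 * half, red and blue in the low half) so that a single multiply by the 5-bit scale
 * can scale all three channels without one carrying into another;
 * SkCompact_rgb_16() packs the result back into 565.
 */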
1147
1148 #define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1149 #else
1150 #define S32A_D565_Opaque_Dither_PROC NULL
1151 #endif
1152
1153 ///////////////////////////////////////////////////////////////////////////////
1154
1155 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1156 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1157 * speedup untested, but the ARM version is 26 insns/iteration and this
1158 * NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
1159 * roughly a 10x reduction in per-pixel instruction count versus the native
1160 * version; that's pure instruction counts, not accounting for any
1161 * instruction or memory latencies. */
1162
1163 #undef DEBUG_S32_OPAQUE_DITHER
1164
1165 static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1166 const SkPMColor* SK_RESTRICT src,
1167 int count, U8CPU alpha, int x, int y) {
1168 SkASSERT(255 == alpha);
1169
1170 #define UNROLL 8
1171 if (count >= UNROLL) {
1172 uint8x8_t d;
1173 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1174 d = vld1_u8(dstart);
1175
1176 while (count >= UNROLL) {
1177 uint8x8_t sr, sg, sb, sa;
1178 uint16x8_t dr, dg, db, da;
1179 uint16x8_t dst8;
1180
1181 /* source is in ABGR ordering (R == lsb) */
1182 {
1183 register uint8x8_t d0 asm("d0");
1184 register uint8x8_t d1 asm("d1");
1185 register uint8x8_t d2 asm("d2");
1186 register uint8x8_t d3 asm("d3");
1187
1188 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1189 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1190 : "r" (src)
1191 );
1192 sr = d0; sg = d1; sb = d2; sa = d3;
1193 }
1194 /* XXX: if we want to prefetch, hide it in the above asm()
1195 * using the gcc __builtin_prefetch(), the prefetch will
1196 * fall to the bottom of the loop -- it won't stick up
1197 * at the top of the loop, just after the vld4.
1198 */
1199
1200 /* sr = sr - (sr>>5) + d */
1201 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1202 dr = vaddl_u8(sr, d);
1203
1204 /* sb = sb - (sb>>5) + d */
1205 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1206 db = vaddl_u8(sb, d);
1207
1208 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1209 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1210 dg = vaddl_u8(sg, vshr_n_u8(d,1));
1211 /* XXX: check that the "d>>1" here is hoisted */
1212
1213 /* pack high bits of each into 565 format (rgb, b is lsb) */
1214 dst8 = vshrq_n_u16(db, 3);
1215 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1216 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
1217
1218 /* store it */
1219 vst1q_u16(dst, dst8);
1220
1221 #if defined(DEBUG_S32_OPAQUE_DITHER)
1222 /* always good to know if we generated good results */
1223 {
1224 int i, myx = x, myy = y;
1225 DITHER_565_SCAN(myy);
1226 for (i=0;i<UNROLL;i++) {
1227 SkPMColor c = src[i];
1228 unsigned dither = DITHER_VALUE(myx);
1229 uint16_t val = SkDitherRGB32To565(c, dither);
1230 if (val != dst[i]) {
1231 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1232 c, dither, val, dst[i], dstart[i]);
1233 }
1234 DITHER_INC_X(myx);
1235 }
1236 }
1237 #endif
1238
1239 dst += UNROLL;
1240 src += UNROLL;
1241 count -= UNROLL;
1242 x += UNROLL; /* probably superfluous */
1243 }
1244 }
1245 #undef UNROLL
1246
1247 /* residuals */
1248 if (count > 0) {
1249 DITHER_565_SCAN(y);
1250 do {
1251 SkPMColor c = *src++;
1252 SkPMColorAssert(c);
1253 SkASSERT(SkGetPackedA32(c) == 255);
1254
1255 unsigned dither = DITHER_VALUE(x);
1256 *dst++ = SkDitherRGB32To565(c, dither);
1257 DITHER_INC_X(x);
1258 } while (--count != 0);
1259 }
1260 }
1261
1262 #define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1263 #else
1264 #define S32_D565_Opaque_Dither_PROC NULL
1265 #endif
1266
1267 ///////////////////////////////////////////////////////////////////////////////
1268
1269 static const SkBlitRow::Proc platform_565_procs[] = {
1270 // no dither
1271 S32_D565_Opaque_PROC,
1272 S32_D565_Blend_PROC,
1273 S32A_D565_Opaque_PROC,
1274 S32A_D565_Blend_PROC,
1275
1276 // dither
1277 S32_D565_Opaque_Dither_PROC,
1278 S32_D565_Blend_Dither_PROC,
1279 S32A_D565_Opaque_Dither_PROC,
1280 NULL, // S32A_D565_Blend_Dither
1281 };
1282
1283 static const SkBlitRow::Proc platform_4444_procs[] = {
1284 // no dither
1285 NULL, // S32_D4444_Opaque,
1286 NULL, // S32_D4444_Blend,
1287 NULL, // S32A_D4444_Opaque,
1288 NULL, // S32A_D4444_Blend,
1289
1290 // dither
1291 NULL, // S32_D4444_Opaque_Dither,
1292 NULL, // S32_D4444_Blend_Dither,
1293 NULL, // S32A_D4444_Opaque_Dither,
1294 NULL, // S32A_D4444_Blend_Dither
1295 };
1296
1297 static const SkBlitRow::Proc32 platform_32_procs[] = {
1298 NULL, // S32_Opaque,
1299 S32_Blend_BlitRow32_PROC, // S32_Blend,
1300 S32A_Opaque_BlitRow32_PROC, // S32A_Opaque,
1301 S32A_Blend_BlitRow32_PROC // S32A_Blend
1302 };
1303
1304 SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
1305 return platform_4444_procs[flags];
1306 }
1307
1308 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
1309 return platform_565_procs[flags];
1310 }
1311
1312 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
1313 return platform_32_procs[flags];
1314 }
1315
1316 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1317 return NULL;
1318 }
1319
1320
1321 SkBlitMask::Proc SkBlitMask::PlatformProcs(SkBitmap::Config dstConfig,
1322 SkColor color)
1323 {
1324 return NULL;
1325 }
1326