1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola
2 *
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
5 */
6
7 #include "SkBitmapProcState.h"
8 #include "SkPerspIter.h"
9 #include "SkShader.h"
10 #include "SkUtils.h"
11
12 /* returns 0...(n-1) given any x (positive or negative).
13
14 As an example, if n (which is always positive) is 5...
15
16 x: -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8
17 returns: 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3
18 */
sk_int_mod(int x,int n)19 static inline int sk_int_mod(int x, int n) {
20 SkASSERT(n > 0);
21 if ((unsigned)x >= (unsigned)n) {
22 if (x < 0) {
23 x = n + ~(~x % n);
24 } else {
25 x = x % n;
26 }
27 }
28 return x;
29 }
30
31 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
32 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
33
34 #define MAKENAME(suffix) ClampX_ClampY ## suffix
35 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
36 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
37 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
38 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
39 #define CHECK_FOR_DECAL
40 #if defined(__ARM_HAVE_NEON)
41 #include "SkBitmapProcState_matrix_clamp.h"
42 #else
43 #include "SkBitmapProcState_matrix.h"
44 #endif
45
46 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix
47 #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
48 #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)
49 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
50 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
51 #if defined(__ARM_HAVE_NEON)
52 #include "SkBitmapProcState_matrix_repeat.h"
53 #else
54 #include "SkBitmapProcState_matrix.h"
55 #endif
56
57 #define MAKENAME(suffix) GeneralXY ## suffix
58 #define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
59 SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY
60 #define PREAMBLE_PARAM_X , SkBitmapProcState::FixedTileProc tileProcX
61 #define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY
62 #define PREAMBLE_ARG_X , tileProcX
63 #define PREAMBLE_ARG_Y , tileProcY
64 #define TILEX_PROCF(fx, max) (tileProcX(fx) * ((max) + 1) >> 16)
65 #define TILEY_PROCF(fy, max) (tileProcY(fy) * ((max) + 1) >> 16)
66 #define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF)
67 #define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF)
68 #include "SkBitmapProcState_matrix.h"
69
fixed_clamp(SkFixed x)70 static inline U16CPU fixed_clamp(SkFixed x)
71 {
72 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
73 if (x >> 16)
74 x = 0xFFFF;
75 if (x < 0)
76 x = 0;
77 #else
78 if (x >> 16)
79 {
80 if (x < 0)
81 x = 0;
82 else
83 x = 0xFFFF;
84 }
85 #endif
86 return x;
87 }
88
fixed_repeat(SkFixed x)89 static inline U16CPU fixed_repeat(SkFixed x)
90 {
91 return x & 0xFFFF;
92 }
93
fixed_mirror(SkFixed x)94 static inline U16CPU fixed_mirror(SkFixed x)
95 {
96 SkFixed s = x << 15 >> 31;
97 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
98 return (x ^ s) & 0xFFFF;
99 }
100
choose_tile_proc(unsigned m)101 static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
102 {
103 if (SkShader::kClamp_TileMode == m)
104 return fixed_clamp;
105 if (SkShader::kRepeat_TileMode == m)
106 return fixed_repeat;
107 SkASSERT(SkShader::kMirror_TileMode == m);
108 return fixed_mirror;
109 }
110
int_clamp(int x,int n)111 static inline U16CPU int_clamp(int x, int n) {
112 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
113 if (x >= n)
114 x = n - 1;
115 if (x < 0)
116 x = 0;
117 #else
118 if ((unsigned)x >= (unsigned)n) {
119 if (x < 0) {
120 x = 0;
121 } else {
122 x = n - 1;
123 }
124 }
125 #endif
126 return x;
127 }
128
int_repeat(int x,int n)129 static inline U16CPU int_repeat(int x, int n) {
130 return sk_int_mod(x, n);
131 }
132
int_mirror(int x,int n)133 static inline U16CPU int_mirror(int x, int n) {
134 x = sk_int_mod(x, 2 * n);
135 if (x >= n) {
136 x = n + ~(x - n);
137 }
138 return x;
139 }
140
#if 0
// Disabled debug aid: prints int_mirror(x, 3) for every x in [-8, 8].
static void test_int_tileprocs() {
    const int kSize = 3;
    int x = -8;
    while (x <= 8) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", x, int_mirror(x, kSize));
        ++x;
    }
}
#endif
148
choose_int_tile_proc(unsigned tm)149 static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
150 if (SkShader::kClamp_TileMode == tm)
151 return int_clamp;
152 if (SkShader::kRepeat_TileMode == tm)
153 return int_repeat;
154 SkASSERT(SkShader::kMirror_TileMode == tm);
155 return int_mirror;
156 }
157
158 //////////////////////////////////////////////////////////////////////////////
159
decal_nofilter_scale(uint32_t dst[],SkFixed fx,SkFixed dx,int count)160 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
161 {
162 int i;
163
164 #if defined(__ARM_HAVE_NEON)
165 if (count >= 8) {
166 /* SkFixed is 16.16 fixed point */
167 SkFixed dx2 = dx+dx;
168 SkFixed dx4 = dx2+dx2;
169 SkFixed dx8 = dx4+dx4;
170
171 /* now build fx/fx+dx/fx+2dx/fx+3dx */
172 SkFixed fx1, fx2, fx3;
173 int32x2_t lower, upper;
174 int32x4_t lbase, hbase;
175 uint16_t *dst16 = (uint16_t *)dst;
176
177 fx1 = fx+dx;
178 fx2 = fx1+dx;
179 fx3 = fx2+dx;
180
181 /* avoid an 'lbase unitialized' warning */
182 lbase = vdupq_n_s32(fx);
183 lbase = vsetq_lane_s32(fx1, lbase, 1);
184 lbase = vsetq_lane_s32(fx2, lbase, 2);
185 lbase = vsetq_lane_s32(fx3, lbase, 3);
186 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
187
188 /* take upper 16 of each, store, and bump everything */
189 do {
190 int32x4_t lout, hout;
191 uint16x8_t hi16;
192
193 lout = lbase;
194 hout = hbase;
195 /* gets hi's of all louts then hi's of all houts */
196 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
197 hi16 = vreinterpretq_u16_s32(hout);
198 vst1q_u16(dst16, hi16);
199
200 /* on to the next */
201 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
202 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
203 dst16 += 8;
204 count -= 8;
205 fx += dx8;
206 } while (count >= 8);
207 dst = (uint32_t *) dst16;
208 }
209 #else
210 for (i = (count >> 2); i > 0; --i)
211 {
212 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
213 fx += dx+dx;
214 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
215 fx += dx+dx;
216 }
217 count &= 3;
218 #endif
219
220 uint16_t* xx = (uint16_t*)dst;
221 for (i = count; i > 0; --i) {
222 *xx++ = SkToU16(fx >> 16); fx += dx;
223 }
224 }
225
decal_filter_scale(uint32_t dst[],SkFixed fx,SkFixed dx,int count)226 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
227 {
228
229 #if defined(__ARM_HAVE_NEON)
230 if (count >= 8) {
231 int32x4_t wide_fx;
232 int32x4_t wide_fx2;
233 int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
234
235 wide_fx = vdupq_n_s32(fx);
236 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
237 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
238 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
239
240 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
241
242 while (count >= 8) {
243 int32x4_t wide_out;
244 int32x4_t wide_out2;
245
246 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
247 wide_out = vorrq_s32(wide_out,
248 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
249
250 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
251 wide_out2 = vorrq_s32(wide_out2,
252 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
253
254 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
255 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
256
257 dst += 8;
258 fx += dx*8;
259 wide_fx = vaddq_s32(wide_fx, wide_dx8);
260 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
261 count -= 8;
262 }
263 }
264 #endif
265
266 if (count & 1)
267 {
268 SkASSERT((fx >> (16 + 14)) == 0);
269 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
270 fx += dx;
271 }
272 while ((count -= 2) >= 0)
273 {
274 SkASSERT((fx >> (16 + 14)) == 0);
275 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
276 fx += dx;
277
278 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
279 fx += dx;
280 }
281 }
282
283 ///////////////////////////////////////////////////////////////////////////////
284 // stores the same as SCALE, but is cheaper to compute. Also since there is no
285 // scale, we don't need/have a FILTER version
286
/*  Fills xptr[0 .. count) with the ascending run start, start + 1, ...,
    each value truncated to 16 bits. Writes nothing when count <= 0.
 */
static void fill_sequential(uint16_t xptr[], int start, int count) {
    // Deliberately plain 16-bit stores. Storing packed pairs through a
    // uint32_t* would break strict aliasing on a uint16_t array, and an
    // unconditional "align the pointer first" store would write one element
    // even when count is 0.
    for (int i = 0; i < count; ++i) {
        xptr[i] = static_cast<uint16_t>(start + i);
    }
}
317
nofilter_trans_preamble(const SkBitmapProcState & s,uint32_t ** xy,int x,int y)318 static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
319 int x, int y) {
320 SkPoint pt;
321 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
322 SkIntToScalar(y) + SK_ScalarHalf, &pt);
323 **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
324 s.fBitmap->height());
325 *xy += 1; // bump the ptr
326 // return our starting X position
327 return SkScalarToFixed(pt.fX) >> 16;
328 }
329
clampx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)330 static void clampx_nofilter_trans(const SkBitmapProcState& s,
331 uint32_t xy[], int count, int x, int y) {
332 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
333
334 int xpos = nofilter_trans_preamble(s, &xy, x, y);
335 const int width = s.fBitmap->width();
336 if (1 == width) {
337 // all of the following X values must be 0
338 memset(xy, 0, count * sizeof(uint16_t));
339 return;
340 }
341
342 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
343 int n;
344
345 // fill before 0 as needed
346 if (xpos < 0) {
347 n = -xpos;
348 if (n > count) {
349 n = count;
350 }
351 memset(xptr, 0, n * sizeof(uint16_t));
352 count -= n;
353 if (0 == count) {
354 return;
355 }
356 xptr += n;
357 xpos = 0;
358 }
359
360 // fill in 0..width-1 if needed
361 if (xpos < width) {
362 n = width - xpos;
363 if (n > count) {
364 n = count;
365 }
366 fill_sequential(xptr, xpos, n);
367 count -= n;
368 if (0 == count) {
369 return;
370 }
371 xptr += n;
372 }
373
374 // fill the remaining with the max value
375 sk_memset16(xptr, width - 1, count);
376 }
377
repeatx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)378 static void repeatx_nofilter_trans(const SkBitmapProcState& s,
379 uint32_t xy[], int count, int x, int y) {
380 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
381
382 int xpos = nofilter_trans_preamble(s, &xy, x, y);
383 const int width = s.fBitmap->width();
384 if (1 == width) {
385 // all of the following X values must be 0
386 memset(xy, 0, count * sizeof(uint16_t));
387 return;
388 }
389
390 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
391 int start = sk_int_mod(xpos, width);
392 int n = width - start;
393 if (n > count) {
394 n = count;
395 }
396 fill_sequential(xptr, start, n);
397 xptr += n;
398 count -= n;
399
400 while (count >= width) {
401 fill_sequential(xptr, 0, width);
402 xptr += width;
403 count -= width;
404 }
405
406 if (count > 0) {
407 fill_sequential(xptr, 0, count);
408 }
409 }
410
fill_backwards(uint16_t xptr[],int pos,int count)411 static void fill_backwards(uint16_t xptr[], int pos, int count) {
412 for (int i = 0; i < count; i++) {
413 SkASSERT(pos >= 0);
414 xptr[i] = pos--;
415 }
416 }
417
mirrorx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)418 static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
419 uint32_t xy[], int count, int x, int y) {
420 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
421
422 int xpos = nofilter_trans_preamble(s, &xy, x, y);
423 const int width = s.fBitmap->width();
424 if (1 == width) {
425 // all of the following X values must be 0
426 memset(xy, 0, count * sizeof(uint16_t));
427 return;
428 }
429
430 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
431 // need to know our start, and our initial phase (forward or backward)
432 bool forward;
433 int n;
434 int start = sk_int_mod(xpos, 2 * width);
435 if (start >= width) {
436 start = width + ~(start - width);
437 forward = false;
438 n = start + 1; // [start .. 0]
439 } else {
440 forward = true;
441 n = width - start; // [start .. width)
442 }
443 if (n > count) {
444 n = count;
445 }
446 if (forward) {
447 fill_sequential(xptr, start, n);
448 } else {
449 fill_backwards(xptr, start, n);
450 }
451 forward = !forward;
452 xptr += n;
453 count -= n;
454
455 while (count >= width) {
456 if (forward) {
457 fill_sequential(xptr, 0, width);
458 } else {
459 fill_backwards(xptr, width - 1, width);
460 }
461 forward = !forward;
462 xptr += width;
463 count -= width;
464 }
465
466 if (count > 0) {
467 if (forward) {
468 fill_sequential(xptr, 0, count);
469 } else {
470 fill_backwards(xptr, width - 1, count);
471 }
472 }
473 }
474
475 ///////////////////////////////////////////////////////////////////////////////
476
477 SkBitmapProcState::MatrixProc
chooseMatrixProc(bool trivial_matrix)478 SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
479 // test_int_tileprocs();
480 // check for our special case when there is no scale/affine/perspective
481 if (trivial_matrix) {
482 SkASSERT(!fDoFilter);
483 fIntTileProcY = choose_int_tile_proc(fTileModeY);
484 switch (fTileModeX) {
485 case SkShader::kClamp_TileMode:
486 return clampx_nofilter_trans;
487 case SkShader::kRepeat_TileMode:
488 return repeatx_nofilter_trans;
489 case SkShader::kMirror_TileMode:
490 return mirrorx_nofilter_trans;
491 }
492 }
493
494 int index = 0;
495 if (fDoFilter) {
496 index = 1;
497 }
498 if (fInvType & SkMatrix::kPerspective_Mask) {
499 index += 4;
500 } else if (fInvType & SkMatrix::kAffine_Mask) {
501 index += 2;
502 }
503
504 if (SkShader::kClamp_TileMode == fTileModeX &&
505 SkShader::kClamp_TileMode == fTileModeY)
506 {
507 // clamp gets special version of filterOne
508 fFilterOneX = SK_Fixed1;
509 fFilterOneY = SK_Fixed1;
510 return ClampX_ClampY_Procs[index];
511 }
512
513 // all remaining procs use this form for filterOne
514 fFilterOneX = SK_Fixed1 / fBitmap->width();
515 fFilterOneY = SK_Fixed1 / fBitmap->height();
516
517 if (SkShader::kRepeat_TileMode == fTileModeX &&
518 SkShader::kRepeat_TileMode == fTileModeY)
519 {
520 return RepeatX_RepeatY_Procs[index];
521 }
522
523 fTileProcX = choose_tile_proc(fTileModeX);
524 fTileProcY = choose_tile_proc(fTileModeY);
525 return GeneralXY_Procs[index];
526 }
527
528