1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola */
2
3 #include "SkBitmapProcState.h"
4 #include "SkPerspIter.h"
5 #include "SkShader.h"
6 #include "SkUtils.h"
7
8 /* returns 0...(n-1) given any x (positive or negative).
9
10 As an example, if n (which is always positive) is 5...
11
12 x: -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8
13 returns: 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3
14 */
sk_int_mod(int x,int n)15 static inline int sk_int_mod(int x, int n) {
16 SkASSERT(n > 0);
17 if ((unsigned)x >= (unsigned)n) {
18 if (x < 0) {
19 x = n + ~(~x % n);
20 } else {
21 x = x % n;
22 }
23 }
24 return x;
25 }
26
// Forward declarations of the two decal helpers that are defined further down
// in this file. They are declared ahead of the matrix-proc headers included
// below -- presumably so the CHECK_FOR_DECAL paths in those headers can call
// them (confirm against SkBitmapProcState_matrix.h).
27 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
28 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
29
// ---- ClampX_ClampY proc family ---------------------------------------------
// The macros below parameterize the matrix-proc header included at the end of
// this group; every generated symbol is prefixed "ClampX_ClampY" via MAKENAME
// (chooseMatrixProc() later indexes ClampX_ClampY_Procs[]).
//   TILE?_PROCF    : integer part of the fixed-point coordinate (>> 16),
//                    clamped with SkClampMax against max.
//   TILE?_LOW_BITS : bits 12..15 of the fixed-point value, i.e. a 4-bit
//                    fraction; max is unused here.
//   CHECK_FOR_DECAL: defined only for this family.
// When __ARM_HAVE_NEON is defined a clamp-specific header is used instead of
// the generic one.
// NOTE(review): the same macro names are redefined by the following groups
// without an #undef here, so the included header presumably #undefs them --
// confirm.
30 #define MAKENAME(suffix) ClampX_ClampY ## suffix
31 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
32 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
33 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
34 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
35 #define CHECK_FOR_DECAL
36 #if defined(__ARM_HAVE_NEON)
37 #include "SkBitmapProcState_matrix_clamp.h"
38 #else
39 #include "SkBitmapProcState_matrix.h"
40 #endif
41
// ---- RepeatX_RepeatY proc family -------------------------------------------
// Same template mechanism as the group above, generating symbols prefixed
// "RepeatX_RepeatY" (chooseMatrixProc() later indexes RepeatX_RepeatY_Procs[]).
//   TILE?_PROCF    : keep only the low 16 bits of the fixed-point value, scale
//                    them by (max + 1) and take the integer part (>> 16).
//   TILE?_LOW_BITS : the same scaled product shifted by 12 instead of 16 and
//                    masked to 4 bits, i.e. the 4 bits just below the index.
// CHECK_FOR_DECAL is not defined in this group.
// When __ARM_HAVE_NEON is defined a repeat-specific header is used instead of
// the generic one.
42 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix
43 #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
44 #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)
45 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
46 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
47 #if defined(__ARM_HAVE_NEON)
48 #include "SkBitmapProcState_matrix_repeat.h"
49 #else
50 #include "SkBitmapProcState_matrix.h"
51 #endif
52
// ---- GeneralXY proc family -------------------------------------------------
// Fallback family for any mix of tile modes; symbols are prefixed "GeneralXY"
// (chooseMatrixProc() later indexes GeneralXY_Procs[]). Instead of hard-coding
// the tiling math, each axis goes through a function pointer:
//   PREAMBLE            : copies fTileProcX / fTileProcY out of the state into
//                         locals named tileProcX / tileProcY.
//   PREAMBLE_PARAM_X/Y  : comma-led parameter declaration for passing one of
//                         those pointers into a helper.
//   PREAMBLE_ARG_X/Y    : the matching comma-led call argument.
//   TILE?_PROCF         : tileProc result scaled by (max + 1), integer part.
//   TILE?_LOW_BITS      : same product shifted by 12 and masked to 4 bits.
// Always instantiated from the generic header; there is no NEON variant here.
53 #define MAKENAME(suffix) GeneralXY ## suffix
54 #define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
55 SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY
56 #define PREAMBLE_PARAM_X , SkBitmapProcState::FixedTileProc tileProcX
57 #define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY
58 #define PREAMBLE_ARG_X , tileProcX
59 #define PREAMBLE_ARG_Y , tileProcY
60 #define TILEX_PROCF(fx, max) (tileProcX(fx) * ((max) + 1) >> 16)
61 #define TILEY_PROCF(fy, max) (tileProcY(fy) * ((max) + 1) >> 16)
62 #define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF)
63 #define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF)
64 #include "SkBitmapProcState_matrix.h"
65
fixed_clamp(SkFixed x)66 static inline U16CPU fixed_clamp(SkFixed x)
67 {
68 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
69 if (x >> 16)
70 x = 0xFFFF;
71 if (x < 0)
72 x = 0;
73 #else
74 if (x >> 16)
75 {
76 if (x < 0)
77 x = 0;
78 else
79 x = 0xFFFF;
80 }
81 #endif
82 return x;
83 }
84
fixed_repeat(SkFixed x)85 static inline U16CPU fixed_repeat(SkFixed x)
86 {
87 return x & 0xFFFF;
88 }
89
fixed_mirror(SkFixed x)90 static inline U16CPU fixed_mirror(SkFixed x)
91 {
92 SkFixed s = x << 15 >> 31;
93 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
94 return (x ^ s) & 0xFFFF;
95 }
96
choose_tile_proc(unsigned m)97 static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
98 {
99 if (SkShader::kClamp_TileMode == m)
100 return fixed_clamp;
101 if (SkShader::kRepeat_TileMode == m)
102 return fixed_repeat;
103 SkASSERT(SkShader::kMirror_TileMode == m);
104 return fixed_mirror;
105 }
106
int_clamp(int x,int n)107 static inline U16CPU int_clamp(int x, int n) {
108 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
109 if (x >= n)
110 x = n - 1;
111 if (x < 0)
112 x = 0;
113 #else
114 if ((unsigned)x >= (unsigned)n) {
115 if (x < 0) {
116 x = 0;
117 } else {
118 x = n - 1;
119 }
120 }
121 #endif
122 return x;
123 }
124
int_repeat(int x,int n)125 static inline U16CPU int_repeat(int x, int n) {
126 return sk_int_mod(x, n);
127 }
128
int_mirror(int x,int n)129 static inline U16CPU int_mirror(int x, int n) {
130 x = sk_int_mod(x, 2 * n);
131 if (x >= n) {
132 x = n + ~(x - n);
133 }
134 return x;
135 }
136
// Disabled (#if 0) debug helper: prints int_mirror(i, 3) for every i in
// [-8, 8] through SkDebugf. Kept only as a manual sanity check.
137 #if 0
138 static void test_int_tileprocs() {
139 for (int i = -8; i <= 8; i++) {
140 SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
141 }
142 }
143 #endif
144
choose_int_tile_proc(unsigned tm)145 static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
146 if (SkShader::kClamp_TileMode == tm)
147 return int_clamp;
148 if (SkShader::kRepeat_TileMode == tm)
149 return int_repeat;
150 SkASSERT(SkShader::kMirror_TileMode == tm);
151 return int_mirror;
152 }
153
154 //////////////////////////////////////////////////////////////////////////////
155
/*  Decal (no tiling), no-filter, scale-only coordinate generator.

    Produces `count` 16-bit values, packed two per uint32_t into dst. Each
    value is the integer part (>> 16) of a fixed-point x coordinate that starts
    at fx and advances by dx per output.

    No clamping or wrapping is applied here -- presumably the caller has
    already verified that every coordinate lands inside the bitmap (confirm at
    the CHECK_FOR_DECAL call site).

    With __ARM_HAVE_NEON, batches of 8 outputs are produced per iteration and
    the scalar tail loop finishes the remaining 0..7. Without NEON, batches of
    4 are produced via pack_two_shorts() and the tail loop finishes the
    remaining 0..3.
 */
decal_nofilter_scale(uint32_t dst[],SkFixed fx,SkFixed dx,int count)156 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
157 {
158 int i;
159
160 #if defined(__ARM_HAVE_NEON)
161 if (count >= 8) {
162 /* SkFixed is 16.16 fixed point */
163 SkFixed dx2 = dx+dx;
164 SkFixed dx4 = dx2+dx2;
165 SkFixed dx8 = dx4+dx4;
166
167 /* now build fx/fx+dx/fx+2dx/fx+3dx */
168 SkFixed fx1, fx2, fx3;
169 int32x2_t lower, upper;
170 int32x4_t lbase, hbase;
171 uint16_t *dst16 = (uint16_t *)dst;
172
173 fx1 = fx+dx;
174 fx2 = fx1+dx;
175 fx3 = fx2+dx;
176
177 /* avoid an 'lbase uninitialized' warning */
178 lbase = vdupq_n_s32(fx);
179 lbase = vsetq_lane_s32(fx1, lbase, 1);
180 lbase = vsetq_lane_s32(fx2, lbase, 2);
181 lbase = vsetq_lane_s32(fx3, lbase, 3);
/* hbase holds the next four coordinates: fx+4dx .. fx+7dx */
182 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
183
184 /* take upper 16 of each, store, and bump everything */
185 do {
186 int32x4_t lout, hout;
187 uint16x8_t hi16;
188
/* work on copies: the unzip below modifies both of its operands in place */
189 lout = lbase;
190 hout = hbase;
191 /* gets hi's of all louts then hi's of all houts */
192 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
193 hi16 = vreinterpretq_u16_s32(hout);
194 vst1q_u16(dst16, hi16);
195
196 /* on to the next */
197 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
198 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
199 dst16 += 8;
200 count -= 8;
/* keep the scalar fx in step so the tail loop resumes at the right place */
201 fx += dx8;
202 } while (count >= 8);
203 dst = (uint32_t *) dst16;
204 }
205 #else
/* four outputs (two packed words) per iteration */
206 for (i = (count >> 2); i > 0; --i)
207 {
208 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
209 fx += dx+dx;
210 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
211 fx += dx+dx;
212 }
213 count &= 3;
214 #endif
215
/* scalar tail: whatever the batched path above left over */
216 uint16_t* xx = (uint16_t*)dst;
217 for (i = count; i > 0; --i) {
218 *xx++ = SkToU16(fx >> 16); fx += dx;
219 }
220 }
221
/*  Decal (no tiling), filtered, scale-only coordinate generator.

    Produces `count` 32-bit entries in dst, one per output, for a fixed-point x
    coordinate that starts at fx and advances by dx. Each entry is

        ((fx >> 12) << 14) | ((fx >> 16) + 1)

    so the low 14 bits carry (integer part + 1) and the bits above carry
    fx >> 12, i.e. the integer part followed by a 4-bit fraction. The
    SkASSERTs in the scalar path check that fx >> 30 is zero, which is what
    keeps the integer part inside its 14-bit field.

    With __ARM_HAVE_NEON, batches of 8 entries are produced per iteration; the
    scalar code afterwards handles what is left (or everything, without NEON).
 */
decal_filter_scale(uint32_t dst[],SkFixed fx,SkFixed dx,int count)222 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
223 {
224
225 #if defined(__ARM_HAVE_NEON)
226 if (count >= 8) {
227 int32x4_t wide_fx;
228 int32x4_t wide_fx2;
229 int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
230
/* wide_fx = { fx, fx+dx, fx+2dx, fx+3dx } */
231 wide_fx = vdupq_n_s32(fx);
232 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
233 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
234 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
235
/* wide_fx2 = the next four coordinates, fx+4dx .. fx+7dx */
236 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
237
238 while (count >= 8) {
239 int32x4_t wide_out;
240 int32x4_t wide_out2;
241
/* same packing as the scalar expression, four lanes at a time */
242 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
243 wide_out = vorrq_s32(wide_out,
244 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
245
246 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
247 wide_out2 = vorrq_s32(wide_out2,
248 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
249
250 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
251 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
252
253 dst += 8;
/* keep the scalar fx in step so the code after the loop resumes correctly */
254 fx += dx*8;
255 wide_fx = vaddq_s32(wide_fx, wide_dx8);
256 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
257 count -= 8;
258 }
259 }
260 #endif
261
/* odd remainder first, so the loop below can always work in pairs */
262 if (count & 1)
263 {
264 SkASSERT((fx >> (16 + 14)) == 0);
265 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
266 fx += dx;
267 }
/* two entries per iteration; note only the first fx of each pair is asserted */
268 while ((count -= 2) >= 0)
269 {
270 SkASSERT((fx >> (16 + 14)) == 0);
271 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
272 fx += dx;
273
274 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
275 fx += dx;
276 }
277 }
278
279 ///////////////////////////////////////////////////////////////////////////////
280 // stores the same as SCALE, but is cheaper to compute. Also since there is no
281 // scale, we don't need/have a FILTER version
282
fill_sequential(uint16_t xptr[],int start,int count)283 static void fill_sequential(uint16_t xptr[], int start, int count) {
284 #if 1
285 if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
286 *xptr++ = start++;
287 count -= 1;
288 }
289 if (count > 3) {
290 uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
291 uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
292 uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
293 start += count & ~3;
294 int qcount = count >> 2;
295 do {
296 *xxptr++ = pattern0;
297 pattern0 += 0x40004;
298 *xxptr++ = pattern1;
299 pattern1 += 0x40004;
300 } while (--qcount != 0);
301 xptr = reinterpret_cast<uint16_t*>(xxptr);
302 count &= 3;
303 }
304 while (--count >= 0) {
305 *xptr++ = start++;
306 }
307 #else
308 for (int i = 0; i < count; i++) {
309 *xptr++ = start++;
310 }
311 #endif
312 }
313
nofilter_trans_preamble(const SkBitmapProcState & s,uint32_t ** xy,int x,int y)314 static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
315 int x, int y) {
316 SkPoint pt;
317 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
318 SkIntToScalar(y) + SK_ScalarHalf, &pt);
319 **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
320 s.fBitmap->height());
321 *xy += 1; // bump the ptr
322 // return our starting X position
323 return SkScalarToFixed(pt.fX) >> 16;
324 }
325
clampx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)326 static void clampx_nofilter_trans(const SkBitmapProcState& s,
327 uint32_t xy[], int count, int x, int y) {
328 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
329
330 int xpos = nofilter_trans_preamble(s, &xy, x, y);
331 const int width = s.fBitmap->width();
332 if (1 == width) {
333 // all of the following X values must be 0
334 memset(xy, 0, count * sizeof(uint16_t));
335 return;
336 }
337
338 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
339 int n;
340
341 // fill before 0 as needed
342 if (xpos < 0) {
343 n = -xpos;
344 if (n > count) {
345 n = count;
346 }
347 memset(xptr, 0, n * sizeof(uint16_t));
348 count -= n;
349 if (0 == count) {
350 return;
351 }
352 xptr += n;
353 xpos = 0;
354 }
355
356 // fill in 0..width-1 if needed
357 if (xpos < width) {
358 n = width - xpos;
359 if (n > count) {
360 n = count;
361 }
362 fill_sequential(xptr, xpos, n);
363 count -= n;
364 if (0 == count) {
365 return;
366 }
367 xptr += n;
368 }
369
370 // fill the remaining with the max value
371 sk_memset16(xptr, width - 1, count * sizeof(uint16_t));
372 }
373
repeatx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)374 static void repeatx_nofilter_trans(const SkBitmapProcState& s,
375 uint32_t xy[], int count, int x, int y) {
376 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
377
378 int xpos = nofilter_trans_preamble(s, &xy, x, y);
379 const int width = s.fBitmap->width();
380 if (1 == width) {
381 // all of the following X values must be 0
382 memset(xy, 0, count * sizeof(uint16_t));
383 return;
384 }
385
386 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
387 int start = sk_int_mod(xpos, width);
388 int n = width - start;
389 if (n > count) {
390 n = count;
391 }
392 fill_sequential(xptr, start, n);
393 xptr += n;
394 count -= n;
395
396 while (count >= width) {
397 fill_sequential(xptr, 0, width);
398 xptr += width;
399 count -= width;
400 }
401
402 if (count > 0) {
403 fill_sequential(xptr, 0, count);
404 }
405 }
406
fill_backwards(uint16_t xptr[],int pos,int count)407 static void fill_backwards(uint16_t xptr[], int pos, int count) {
408 for (int i = 0; i < count; i++) {
409 SkASSERT(pos >= 0);
410 xptr[i] = pos--;
411 }
412 }
413
mirrorx_nofilter_trans(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)414 static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
415 uint32_t xy[], int count, int x, int y) {
416 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
417
418 int xpos = nofilter_trans_preamble(s, &xy, x, y);
419 const int width = s.fBitmap->width();
420 if (1 == width) {
421 // all of the following X values must be 0
422 memset(xy, 0, count * sizeof(uint16_t));
423 return;
424 }
425
426 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
427 // need to know our start, and our initial phase (forward or backward)
428 bool forward;
429 int n;
430 int start = sk_int_mod(xpos, 2 * width);
431 if (start >= width) {
432 start = width + ~(start - width);
433 forward = false;
434 n = start + 1; // [start .. 0]
435 } else {
436 forward = true;
437 n = width - start; // [start .. width)
438 }
439 if (n > count) {
440 n = count;
441 }
442 if (forward) {
443 fill_sequential(xptr, start, n);
444 } else {
445 fill_backwards(xptr, start, n);
446 }
447 forward = !forward;
448 xptr += n;
449 count -= n;
450
451 while (count >= width) {
452 if (forward) {
453 fill_sequential(xptr, 0, width);
454 } else {
455 fill_backwards(xptr, width - 1, width);
456 }
457 forward = !forward;
458 xptr += width;
459 count -= width;
460 }
461
462 if (count > 0) {
463 if (forward) {
464 fill_sequential(xptr, 0, count);
465 } else {
466 fill_backwards(xptr, width - 1, count);
467 }
468 }
469 }
470
471 ///////////////////////////////////////////////////////////////////////////////
472
473 SkBitmapProcState::MatrixProc
chooseMatrixProc(bool trivial_matrix)474 SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
475 // test_int_tileprocs();
476 // check for our special case when there is no scale/affine/perspective
477 if (trivial_matrix) {
478 SkASSERT(!fDoFilter);
479 fIntTileProcY = choose_int_tile_proc(fTileModeY);
480 switch (fTileModeX) {
481 case SkShader::kClamp_TileMode:
482 return clampx_nofilter_trans;
483 case SkShader::kRepeat_TileMode:
484 return repeatx_nofilter_trans;
485 case SkShader::kMirror_TileMode:
486 return mirrorx_nofilter_trans;
487 }
488 }
489
490 int index = 0;
491 if (fDoFilter) {
492 index = 1;
493 }
494 if (fInvType & SkMatrix::kPerspective_Mask) {
495 index += 4;
496 } else if (fInvType & SkMatrix::kAffine_Mask) {
497 index += 2;
498 }
499
500 if (SkShader::kClamp_TileMode == fTileModeX &&
501 SkShader::kClamp_TileMode == fTileModeY)
502 {
503 // clamp gets special version of filterOne
504 fFilterOneX = SK_Fixed1;
505 fFilterOneY = SK_Fixed1;
506 return ClampX_ClampY_Procs[index];
507 }
508
509 // all remaining procs use this form for filterOne
510 fFilterOneX = SK_Fixed1 / fBitmap->width();
511 fFilterOneY = SK_Fixed1 / fBitmap->height();
512
513 if (SkShader::kRepeat_TileMode == fTileModeX &&
514 SkShader::kRepeat_TileMode == fTileModeY)
515 {
516 return RepeatX_RepeatY_Procs[index];
517 }
518
519 fTileProcX = choose_tile_proc(fTileModeX);
520 fTileProcY = choose_tile_proc(fTileModeY);
521 return GeneralXY_Procs[index];
522 }
523
524