1 /*
2 * Copyright 2014 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include <arm_neon.h>
9
// Names of the functions generated by this file. Every name is built with
// MAKENAME, which is not defined here and must be supplied before this point.

// Matrix procs that do not filter.
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)

// Matrix procs that filter.
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)

// Helpers that pack filter coordinates: scalar (one value) and 4-wide (NEON).
#define PACK_FILTER_X_NAME      MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME      MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME     MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME     MAKENAME(_pack_filter_y4)
21
// Optional hooks. When PREAMBLE has not been defined before this point, the
// preamble statement and the extra parameter/argument lists threaded through
// the pack helpers all expand to nothing.
#if !defined(PREAMBLE)
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
29
SCALE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)30 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
31 uint32_t xy[], int count, int x, int y) {
32 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
33 SkMatrix::kScale_Mask)) == 0);
34
35 PREAMBLE(s);
36
37 // we store y, x, x, x, x, x
38 const unsigned maxX = s.fPixmap.width() - 1;
39 SkFractionalInt fx;
40 {
41 const SkBitmapProcStateAutoMapper mapper(s, x, y);
42 const unsigned maxY = s.fPixmap.height() - 1;
43 *xy++ = TILEY_PROCF(mapper.fixedY(), maxY);
44 fx = mapper.fractionalIntX();
45 }
46
47 if (0 == maxX) {
48 // all of the following X values must be 0
49 memset(xy, 0, count * sizeof(uint16_t));
50 return;
51 }
52
53 const SkFractionalInt dx = s.fInvSxFractionalInt;
54
55 #ifdef CHECK_FOR_DECAL
56 // test if we don't need to apply the tile proc
57 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
58 decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
59 SkFractionalIntToFixed(dx), count);
60 return;
61 }
62 #endif
63
64 if (count >= 8) {
65 SkFractionalInt dx2 = dx+dx;
66 SkFractionalInt dx4 = dx2+dx2;
67 SkFractionalInt dx8 = dx4+dx4;
68
69 // now build fx/fx+dx/fx+2dx/fx+3dx
70 SkFractionalInt fx1, fx2, fx3;
71 int32x4_t lbase, hbase;
72 int16_t *dst16 = (int16_t *)xy;
73
74 fx1 = fx+dx;
75 fx2 = fx1+dx;
76 fx3 = fx2+dx;
77
78 lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
79 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
80 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
81 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
82 hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
83
84 // store & bump
85 while (count >= 8) {
86
87 int16x8_t fx8;
88
89 fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
90
91 vst1q_s16(dst16, fx8);
92
93 // but preserving base & on to the next
94 lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
95 hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
96 dst16 += 8;
97 count -= 8;
98 fx += dx8;
99 };
100 xy = (uint32_t *) dst16;
101 }
102
103 uint16_t* xx = (uint16_t*)xy;
104 for (int i = count; i > 0; --i) {
105 *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
106 fx += dx;
107 }
108 }
109
AFFINE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)110 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
111 uint32_t xy[], int count, int x, int y) {
112 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
113 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
114 SkMatrix::kScale_Mask |
115 SkMatrix::kAffine_Mask)) == 0);
116
117 PREAMBLE(s);
118 const SkBitmapProcStateAutoMapper mapper(s, x, y);
119
120 SkFractionalInt fx = mapper.fractionalIntX();
121 SkFractionalInt fy = mapper.fractionalIntY();
122 SkFractionalInt dx = s.fInvSxFractionalInt;
123 SkFractionalInt dy = s.fInvKyFractionalInt;
124 int maxX = s.fPixmap.width() - 1;
125 int maxY = s.fPixmap.height() - 1;
126
127 if (count >= 8) {
128 SkFractionalInt dx4 = dx * 4;
129 SkFractionalInt dy4 = dy * 4;
130 SkFractionalInt dx8 = dx * 8;
131 SkFractionalInt dy8 = dy * 8;
132
133 int32x4_t xbase, ybase;
134 int32x4_t x2base, y2base;
135 int16_t *dst16 = (int16_t *) xy;
136
137 // now build fx, fx+dx, fx+2dx, fx+3dx
138 xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
139 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
140 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
141 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
142
143 // same for fy
144 ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
145 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
146 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
147 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
148
149 x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
150 y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
151
152 // store & bump
153 do {
154 int16x8x2_t hi16;
155
156 hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
157 hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
158
159 vst2q_s16(dst16, hi16);
160
161 // moving base and on to the next
162 xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
163 ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
164 x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
165 y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
166
167 dst16 += 16; // 8x32 aka 16x16
168 count -= 8;
169 fx += dx8;
170 fy += dy8;
171 } while (count >= 8);
172 xy = (uint32_t *) dst16;
173 }
174
175 for (int i = count; i > 0; --i) {
176 *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
177 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
178 fx += dx; fy += dy;
179 }
180 }
181
PERSP_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)182 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
183 uint32_t* SK_RESTRICT xy,
184 int count, int x, int y) {
185 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
186
187 PREAMBLE(s);
188 // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
189 int maxX = s.fPixmap.width() - 1;
190 int maxY = s.fPixmap.height() - 1;
191
192 SkPerspIter iter(s.fInvMatrix,
193 SkIntToScalar(x) + SK_ScalarHalf,
194 SkIntToScalar(y) + SK_ScalarHalf, count);
195
196 while ((count = iter.next()) != 0) {
197 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
198
199 if (count >= 8) {
200 int32_t *mysrc = (int32_t *) srcXY;
201 int16_t *mydst = (int16_t *) xy;
202 do {
203 int16x8x2_t hi16;
204 int32x4x2_t xy1, xy2;
205
206 xy1 = vld2q_s32(mysrc);
207 xy2 = vld2q_s32(mysrc+8);
208
209 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
210 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
211
212 vst2q_s16(mydst, hi16);
213
214 count -= 8; // 8 iterations
215 mysrc += 16; // 16 longs
216 mydst += 16; // 16 shorts, aka 8 longs
217 } while (count >= 8);
218 // get xy and srcXY fixed up
219 srcXY = (const SkFixed *) mysrc;
220 xy = (uint32_t *) mydst;
221 }
222
223 while (--count >= 0) {
224 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
225 TILEX_PROCF(srcXY[0], maxX);
226 srcXY += 2;
227 }
228 }
229 }
230
PACK_FILTER_Y_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_Y)231 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
232 SkFixed one PREAMBLE_PARAM_Y) {
233 unsigned i = TILEY_PROCF(f, max);
234 i = (i << 4) | TILEY_LOW_BITS(f, max);
235 return (i << 14) | (TILEY_PROCF((f + one), max));
236 }
237
PACK_FILTER_X_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_X)238 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
239 SkFixed one PREAMBLE_PARAM_X) {
240 unsigned i = TILEX_PROCF(f, max);
241 i = (i << 4) | TILEX_LOW_BITS(f, max);
242 return (i << 14) | (TILEX_PROCF((f + one), max));
243 }
244
PACK_FILTER_X4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)245 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
246 SkFixed one PREAMBLE_PARAM_X) {
247 int32x4_t ret, res, wide_one;
248
249 // Prepare constants
250 wide_one = vdupq_n_s32(one);
251
252 // Step 1
253 res = TILEX_PROCF_NEON4(f, max);
254
255 // Step 2
256 ret = TILEX_LOW_BITS_NEON4(f, max);
257 ret = vsliq_n_s32(ret, res, 4);
258
259 // Step 3
260 res = TILEX_PROCF_NEON4(f + wide_one, max);
261 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
262
263 return ret;
264 }
265
PACK_FILTER_Y4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)266 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
267 SkFixed one PREAMBLE_PARAM_X) {
268 int32x4_t ret, res, wide_one;
269
270 // Prepare constants
271 wide_one = vdupq_n_s32(one);
272
273 // Step 1
274 res = TILEY_PROCF_NEON4(f, max);
275
276 // Step 2
277 ret = TILEY_LOW_BITS_NEON4(f, max);
278 ret = vsliq_n_s32(ret, res, 4);
279
280 // Step 3
281 res = TILEY_PROCF_NEON4(f + wide_one, max);
282 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
283
284 return ret;
285 }
286
SCALE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)287 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
288 uint32_t xy[], int count, int x, int y) {
289 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
290 SkMatrix::kScale_Mask)) == 0);
291 SkASSERT(s.fInvKy == 0);
292
293 PREAMBLE(s);
294
295 const unsigned maxX = s.fPixmap.width() - 1;
296 const SkFixed one = s.fFilterOneX;
297 const SkFractionalInt dx = s.fInvSxFractionalInt;
298 SkFractionalInt fx;
299
300 {
301 const SkBitmapProcStateAutoMapper mapper(s, x, y);
302 const SkFixed fy = mapper.fixedY();
303 const unsigned maxY = s.fPixmap.height() - 1;
304 // compute our two Y values up front
305 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
306 // now initialize fx
307 fx = mapper.fractionalIntX();
308 }
309
310 #ifdef CHECK_FOR_DECAL
311 // test if we don't need to apply the tile proc
312 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
313 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
314 SkFractionalIntToFixed(dx), count);
315 return;
316 }
317 #endif
318 {
319
320 if (count >= 4) {
321 int32x4_t wide_fx;
322
323 wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
324 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
325 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
326 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
327
328 while (count >= 4) {
329 int32x4_t res;
330
331 res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
332
333 vst1q_u32(xy, vreinterpretq_u32_s32(res));
334
335 wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
336 fx += dx+dx+dx+dx;
337 xy += 4;
338 count -= 4;
339 }
340 }
341
342 while (--count >= 0) {
343 *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
344 fx += dx;
345 }
346
347 }
348 }
349
AFFINE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)350 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
351 uint32_t xy[], int count, int x, int y) {
352 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
353 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
354 SkMatrix::kScale_Mask |
355 SkMatrix::kAffine_Mask)) == 0);
356
357 PREAMBLE(s);
358 const SkBitmapProcStateAutoMapper mapper(s, x, y);
359
360 SkFixed oneX = s.fFilterOneX;
361 SkFixed oneY = s.fFilterOneY;
362 SkFixed fx = mapper.fixedX();
363 SkFixed fy = mapper.fixedY();
364 SkFixed dx = s.fInvSx;
365 SkFixed dy = s.fInvKy;
366 unsigned maxX = s.fPixmap.width() - 1;
367 unsigned maxY = s.fPixmap.height() - 1;
368
369 if (count >= 4) {
370 int32x4_t wide_fy, wide_fx;
371
372 wide_fx = vdupq_n_s32(fx);
373 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
374 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
375 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
376
377 wide_fy = vdupq_n_s32(fy);
378 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
379 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
380 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
381
382 while (count >= 4) {
383 int32x4x2_t vxy;
384
385 // do the X side, then the Y side, then interleave them
386 vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
387 vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
388
389 // interleave as YXYXYXYX as part of the storing
390 vst2q_s32((int32_t*)xy, vxy);
391
392 // prepare next iteration
393 wide_fx += vdupq_n_s32(dx+dx+dx+dx);
394 fx += dx + dx + dx + dx;
395 wide_fy += vdupq_n_s32(dy+dy+dy+dy);
396 fy += dy+dy+dy+dy;
397 xy += 8; // 4 x's, 4 y's
398 count -= 4;
399 }
400 }
401
402 while (--count >= 0) {
403 // NB: writing Y/X
404 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
405 fy += dy;
406 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
407 fx += dx;
408 }
409 }
410
PERSP_FILTER_NAME(const SkBitmapProcState & s,uint32_t * SK_RESTRICT xy,int count,int x,int y)411 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
412 uint32_t* SK_RESTRICT xy, int count,
413 int x, int y) {
414 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
415
416 PREAMBLE(s);
417 unsigned maxX = s.fPixmap.width() - 1;
418 unsigned maxY = s.fPixmap.height() - 1;
419 SkFixed oneX = s.fFilterOneX;
420 SkFixed oneY = s.fFilterOneY;
421
422 SkPerspIter iter(s.fInvMatrix,
423 SkIntToScalar(x) + SK_ScalarHalf,
424 SkIntToScalar(y) + SK_ScalarHalf, count);
425
426 while ((count = iter.next()) != 0) {
427 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
428
429 while (count >= 4) {
430 int32x4_t wide_x, wide_y;
431 int32x4x2_t vxy, vresyx;
432
433 // load src: x-y-x-y-x-y-x-y
434 vxy = vld2q_s32(srcXY);
435
436 // do the X side, then the Y side, then interleave them
437 wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
438 wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
439
440 vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
441 vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
442
443 // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
444 vst2q_s32((int32_t*)xy, vresyx);
445
446 // on to the next iteration
447 srcXY += 2*4;
448 count -= 4;
449 xy += 2*4;
450 }
451
452 while (--count >= 0) {
453 // NB: we read x/y, we write y/x
454 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
455 oneY PREAMBLE_ARG_Y);
456 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
457 oneX PREAMBLE_ARG_X);
458 srcXY += 2;
459 }
460 }
461 }
462
463 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
464 SCALE_NOFILTER_NAME,
465 SCALE_FILTER_NAME,
466 AFFINE_NOFILTER_NAME,
467 AFFINE_FILTER_NAME,
468 PERSP_NOFILTER_NAME,
469 PERSP_FILTER_NAME
470 };
471
// Tear down every configuration and helper macro used by this file so that
// nothing leaks into the including translation unit (presumably so the file
// can be included again with a different MAKENAME / tile-proc configuration).

// NEON tile procs supplied by the includer.
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

// Scalar tile procs and naming hook supplied by the includer.
// (#undef of a name that is not defined is a no-op, so CHECK_FOR_DECAL needs
// no guard.)
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#undef CHECK_FOR_DECAL

// Matrix proc names defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// Pack helper names defined at the top of this file; these were previously
// left defined after the file ended.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
501