1 /*
2 * Copyright 2014 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include <arm_neon.h>
9
// Names of the four matrix procs emitted by one inclusion of this file.
// MAKENAME is not defined here; it is expected to be supplied by the
// including file (it is #undef'ed again at the bottom of this file).
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)

// Names of the scalar (X/Y) and four-lane (X4/Y4) coordinate packing helpers
// used by the filtering procs.
#define PACK_FILTER_X_NAME      MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME      MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME     MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME     MAKENAME(_pack_filter_y4)

// When the including file did not provide a preamble, default every preamble
// hook to nothing: no per-proc setup statement, no extra helper parameters
// and no extra helper arguments.
#if !defined(PREAMBLE)
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
27
SCALE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)28 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
29 uint32_t xy[], int count, int x, int y) {
30 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
31 SkMatrix::kScale_Mask)) == 0);
32
33 PREAMBLE(s);
34
35 // we store y, x, x, x, x, x
36 const unsigned maxX = s.fPixmap.width() - 1;
37 SkFractionalInt fx;
38 {
39 const SkBitmapProcStateAutoMapper mapper(s, x, y);
40 const unsigned maxY = s.fPixmap.height() - 1;
41 *xy++ = TILEY_PROCF(mapper.fixedY(), maxY);
42 fx = mapper.fractionalIntX();
43 }
44
45 if (0 == maxX) {
46 // all of the following X values must be 0
47 memset(xy, 0, count * sizeof(uint16_t));
48 return;
49 }
50
51 const SkFractionalInt dx = s.fInvSxFractionalInt;
52
53 #ifdef CHECK_FOR_DECAL
54 // test if we don't need to apply the tile proc
55 const SkFixed fixedFx = SkFractionalIntToFixed(fx);
56 const SkFixed fixedDx = SkFractionalIntToFixed(dx);
57 if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
58 decal_nofilter_scale_neon(xy, fixedFx, fixedDx, count);
59 return;
60 }
61 #endif
62
63 if (count >= 8) {
64 SkFractionalInt dx2 = dx+dx;
65 SkFractionalInt dx4 = dx2+dx2;
66 SkFractionalInt dx8 = dx4+dx4;
67
68 // now build fx/fx+dx/fx+2dx/fx+3dx
69 SkFractionalInt fx1, fx2, fx3;
70 int32x4_t lbase, hbase;
71 int16_t *dst16 = (int16_t *)xy;
72
73 fx1 = fx+dx;
74 fx2 = fx1+dx;
75 fx3 = fx2+dx;
76
77 lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
78 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
79 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
80 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
81 hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
82
83 // store & bump
84 while (count >= 8) {
85
86 int16x8_t fx8;
87
88 fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
89
90 vst1q_s16(dst16, fx8);
91
92 // but preserving base & on to the next
93 lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
94 hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
95 dst16 += 8;
96 count -= 8;
97 fx += dx8;
98 };
99 xy = (uint32_t *) dst16;
100 }
101
102 uint16_t* xx = (uint16_t*)xy;
103 for (int i = count; i > 0; --i) {
104 *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
105 fx += dx;
106 }
107 }
108
AFFINE_NOFILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)109 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
110 uint32_t xy[], int count, int x, int y) {
111 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
112 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
113 SkMatrix::kScale_Mask |
114 SkMatrix::kAffine_Mask)) == 0);
115
116 PREAMBLE(s);
117 const SkBitmapProcStateAutoMapper mapper(s, x, y);
118
119 SkFractionalInt fx = mapper.fractionalIntX();
120 SkFractionalInt fy = mapper.fractionalIntY();
121 SkFractionalInt dx = s.fInvSxFractionalInt;
122 SkFractionalInt dy = s.fInvKyFractionalInt;
123 int maxX = s.fPixmap.width() - 1;
124 int maxY = s.fPixmap.height() - 1;
125
126 if (count >= 8) {
127 SkFractionalInt dx4 = dx * 4;
128 SkFractionalInt dy4 = dy * 4;
129 SkFractionalInt dx8 = dx * 8;
130 SkFractionalInt dy8 = dy * 8;
131
132 int32x4_t xbase, ybase;
133 int32x4_t x2base, y2base;
134 int16_t *dst16 = (int16_t *) xy;
135
136 // now build fx, fx+dx, fx+2dx, fx+3dx
137 xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
138 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
139 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
140 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
141
142 // same for fy
143 ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
144 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
145 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
146 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
147
148 x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
149 y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
150
151 // store & bump
152 do {
153 int16x8x2_t hi16;
154
155 hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
156 hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
157
158 vst2q_s16(dst16, hi16);
159
160 // moving base and on to the next
161 xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
162 ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
163 x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
164 y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
165
166 dst16 += 16; // 8x32 aka 16x16
167 count -= 8;
168 fx += dx8;
169 fy += dy8;
170 } while (count >= 8);
171 xy = (uint32_t *) dst16;
172 }
173
174 for (int i = count; i > 0; --i) {
175 *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
176 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
177 fx += dx; fy += dy;
178 }
179 }
180
PACK_FILTER_Y_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_Y)181 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
182 SkFixed one PREAMBLE_PARAM_Y) {
183 unsigned i = TILEY_PROCF(f, max);
184 i = (i << 4) | EXTRACT_LOW_BITS(f, max);
185 return (i << 14) | (TILEY_PROCF((f + one), max));
186 }
187
PACK_FILTER_X_NAME(SkFixed f,unsigned max,SkFixed one PREAMBLE_PARAM_X)188 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
189 SkFixed one PREAMBLE_PARAM_X) {
190 unsigned i = TILEX_PROCF(f, max);
191 i = (i << 4) | EXTRACT_LOW_BITS(f, max);
192 return (i << 14) | (TILEX_PROCF((f + one), max));
193 }
194
PACK_FILTER_X4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)195 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
196 SkFixed one PREAMBLE_PARAM_X) {
197 int32x4_t ret, res, wide_one;
198
199 // Prepare constants
200 wide_one = vdupq_n_s32(one);
201
202 // Step 1
203 res = TILEX_PROCF_NEON4(f, max);
204
205 // Step 2
206 ret = EXTRACT_LOW_BITS_NEON4(f, max);
207 ret = vsliq_n_s32(ret, res, 4);
208
209 // Step 3
210 res = TILEX_PROCF_NEON4(f + wide_one, max);
211 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
212
213 return ret;
214 }
215
PACK_FILTER_Y4_NAME(int32x4_t f,unsigned max,SkFixed one PREAMBLE_PARAM_X)216 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
217 SkFixed one PREAMBLE_PARAM_X) {
218 int32x4_t ret, res, wide_one;
219
220 // Prepare constants
221 wide_one = vdupq_n_s32(one);
222
223 // Step 1
224 res = TILEY_PROCF_NEON4(f, max);
225
226 // Step 2
227 ret = EXTRACT_LOW_BITS_NEON4(f, max);
228 ret = vsliq_n_s32(ret, res, 4);
229
230 // Step 3
231 res = TILEY_PROCF_NEON4(f + wide_one, max);
232 ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
233
234 return ret;
235 }
236
SCALE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)237 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
238 uint32_t xy[], int count, int x, int y) {
239 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
240 SkMatrix::kScale_Mask)) == 0);
241 SkASSERT(s.fInvKy == 0);
242
243 PREAMBLE(s);
244
245 const unsigned maxX = s.fPixmap.width() - 1;
246 const SkFixed one = s.fFilterOneX;
247 const SkFractionalInt dx = s.fInvSxFractionalInt;
248 SkFractionalInt fx;
249
250 {
251 const SkBitmapProcStateAutoMapper mapper(s, x, y);
252 const SkFixed fy = mapper.fixedY();
253 const unsigned maxY = s.fPixmap.height() - 1;
254 // compute our two Y values up front
255 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
256 // now initialize fx
257 fx = mapper.fractionalIntX();
258 }
259
260 #ifdef CHECK_FOR_DECAL
261 // test if we don't need to apply the tile proc
262 const SkFixed fixedFx = SkFractionalIntToFixed(fx);
263 const SkFixed fixedDx = SkFractionalIntToFixed(dx);
264 if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
265 decal_filter_scale_neon(xy, fixedFx, fixedDx, count);
266 return;
267 }
268 #endif
269 {
270
271 if (count >= 4) {
272 int32x4_t wide_fx;
273
274 wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
275 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
276 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
277 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
278
279 while (count >= 4) {
280 int32x4_t res;
281
282 res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
283
284 vst1q_u32(xy, vreinterpretq_u32_s32(res));
285
286 wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
287 fx += dx+dx+dx+dx;
288 xy += 4;
289 count -= 4;
290 }
291 }
292
293 while (--count >= 0) {
294 *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
295 fx += dx;
296 }
297
298 }
299 }
300
AFFINE_FILTER_NAME(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)301 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
302 uint32_t xy[], int count, int x, int y) {
303 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
304 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
305 SkMatrix::kScale_Mask |
306 SkMatrix::kAffine_Mask)) == 0);
307
308 PREAMBLE(s);
309 const SkBitmapProcStateAutoMapper mapper(s, x, y);
310
311 SkFixed oneX = s.fFilterOneX;
312 SkFixed oneY = s.fFilterOneY;
313 SkFixed fx = mapper.fixedX();
314 SkFixed fy = mapper.fixedY();
315 SkFixed dx = s.fInvSx;
316 SkFixed dy = s.fInvKy;
317 unsigned maxX = s.fPixmap.width() - 1;
318 unsigned maxY = s.fPixmap.height() - 1;
319
320 if (count >= 4) {
321 int32x4_t wide_fy, wide_fx;
322
323 wide_fx = vdupq_n_s32(fx);
324 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
325 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
326 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
327
328 wide_fy = vdupq_n_s32(fy);
329 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
330 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
331 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
332
333 while (count >= 4) {
334 int32x4x2_t vxy;
335
336 // do the X side, then the Y side, then interleave them
337 vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
338 vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
339
340 // interleave as YXYXYXYX as part of the storing
341 vst2q_s32((int32_t*)xy, vxy);
342
343 // prepare next iteration
344 wide_fx += vdupq_n_s32(dx+dx+dx+dx);
345 fx += dx + dx + dx + dx;
346 wide_fy += vdupq_n_s32(dy+dy+dy+dy);
347 fy += dy+dy+dy+dy;
348 xy += 8; // 4 x's, 4 y's
349 count -= 4;
350 }
351 }
352
353 while (--count >= 0) {
354 // NB: writing Y/X
355 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
356 fy += dy;
357 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
358 fx += dx;
359 }
360 }
361
362 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
363 SCALE_NOFILTER_NAME,
364 SCALE_FILTER_NAME,
365 AFFINE_NOFILTER_NAME,
366 AFFINE_FILTER_NAME,
367 };
368
// Cleanup: drop every macro that parameterized or was defined by this
// inclusion, so the file can be included again with a different
// configuration and nothing leaks into the including translation unit.

// NEON tile / low-bit helpers supplied by the including file.
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef EXTRACT_LOW_BITS_NEON4

// Scalar configuration supplied by the including file.
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// Proc names defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME

// Packing-helper names defined at the top of this file.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// Preamble hooks (either supplied by the including file or defaulted here).
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef EXTRACT_LOW_BITS
394