/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}
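
// For reference, a plain-C sketch of what the routine above computes
// (illustrative only; the name is hypothetical and this is not part of the
// library): keep the odd pixel of every horizontal pair.
//   void ScaleRowDown2_C_sketch(const uint8* src_ptr, uint8* dst,
//                               int dst_width) {
//     int x;
//     for (x = 0; x < dst_width; ++x) {
//       dst[x] = src_ptr[x * 2 + 1];  // store odd pixels, discard even.
//     }
//   }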

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
    "subs %2, %2, #16 \n" // 16 processed per loop
    "vpaddl.u8 q0, q0 \n" // add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
    "subs %3, %3, #16 \n" // 16 processed per loop
    "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(src_stride), // %1
    "+r"(dst),        // %2
    "+r"(dst_width)   // %3
  :
  : "q0", "q1", "q2", "q3" // Clobber List
  );
}
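
// For reference, a plain-C sketch (illustrative only) of the 2x2 box filter
// above: each output pixel is the rounded average of a 2x2 block.
//   void ScaleRowDown2Box_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
//                                  uint8* dst, int dst_width) {
//     const uint8* s = src_ptr;
//     const uint8* t = src_ptr + src_stride;
//     int x;
//     for (x = 0; x < dst_width; ++x) {
//       dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // rounded.
//       s += 2;
//       t += 2;
//     }
//   }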

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "subs %2, %2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(src_ptr1),  // %3
    "+r"(src_ptr2),  // %4
    "+r"(src_ptr3)   // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
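
// For reference, a plain-C sketch (illustrative only) of the 4x4 box filter
// above: each output pixel is the rounded average of a 4x4 block.
//   void ScaleRowDown4Box_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
//                                  uint8* dst_ptr, int dst_width) {
//     int x, i, j;
//     for (x = 0; x < dst_width; ++x) {
//       int sum = 0;
//       for (i = 0; i < 4; ++i) {
//         for (j = 0; j < 4; ++j) {
//           sum += src_ptr[i * src_stride + x * 4 + j];
//         }
//       }
//       dst_ptr[x] = (uint8)((sum + 8) >> 4);  // divide by 16 with rounding.
//     }
//   }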

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into one of 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n" // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
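
// In plain C the point-sampling above amounts to keeping pixels 0, 1 and 3 of
// every group of 4 (illustrative sketch only; the name is hypothetical):
//   void ScaleRowDown34_C_sketch(const uint8* src_ptr, uint8* dst_ptr,
//                                int dst_width) {
//     int x;
//     for (x = 0; x < dst_width; x += 3) {
//       dst_ptr[x + 0] = src_ptr[0];
//       dst_ptr[x + 1] = src_ptr[1];
//       dst_ptr[x + 2] = src_ptr[3];
//       src_ptr += 4;
//     }
//   }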

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
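
// Notes on the constants above (for reference only):
// - kShuf38 is a vtbl index table that keeps pixels 0, 3 and 6 of every group
//   of 8 source bytes, turning 32 input pixels into 12 output pixels.
// - The box filters below divide their accumulated sums with vqrdmulh, which
//   computes roughly (2 * a * b) >> 16. With b = 65536 / 12 that scales a
//   6-sample sum by ~1/6, and with b = 65536 / 18 a 9-sample sum by ~1/9.
//   A rough worked example (illustrative only): for a 6-sample sum of 1200,
//   (2 * 1200 * 5461 + 32768) >> 16 = 200, which matches 1200 / 6.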

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n"
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n"
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n"
    "add %3, %0 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align it
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //            + s[6 + st * 1] + s[7 + st * 1]
    //            + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride),    // %3
    "+r"(src_ptr1)       // %4
  : "r"(&kMult38_Div6),  // %5
    "r"(&kShuf38_2),     // %6
    "r"(&kMult38_Div9)   // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align it
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp;
  asm volatile (
    "1: \n"
    "mov %0, %1 \n"
    "mov r12, %5 \n"
    "veor q2, q2, q2 \n"
    "veor q3, q3, q3 \n"
    "2: \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], %3 \n"
    "vaddw.u8 q3, q3, d1 \n"
    "vaddw.u8 q2, q2, d0 \n"
    "subs r12, r12, #1 \n"
    "bgt 2b \n"
    MEMACCESS(2)
    "vst1.16 {q2, q3}, [%2]! \n" // store pixels
    "add %1, %1, #16 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop
    "bgt 1b \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
  );
}

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"

// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
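
// As a plain-C sketch (illustrative only; the name is hypothetical), the loop
// below steps a 16.16 fixed-point source position x by dx per output pixel
// and applies BLENDER to the two neighbouring source pixels:
//   void ScaleFilterCols_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
//                                 int dst_width, int x, int dx) {
//     int j;
//     for (j = 0; j < dst_width; ++j) {
//       int xi = x >> 16;  // integer source position
//       int a = src_ptr[xi];
//       int b = src_ptr[xi + 1];
//       dst_ptr[j] = (uint8)(a + (((x & 0xffff) * (b - a)) >> 16));
//       x += dx;
//     }
//   }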

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    "vdup.32 q0, %3 \n" // x
    "vdup.32 q1, %4 \n" // dx
    "vld1.32 {q2}, [%5] \n" // 0 1 2 3
    "vshl.i32 q3, q1, #2 \n" // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q1, q1, q0 \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32 q2, q1, q3 \n"
    "vshl.i32 q0, q3, #1 \n" // 8 * dx
    "1: \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov q10, q1 \n"
    "vmov q11, q2 \n"
    "vuzp.16 q10, q11 \n"
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"
    "vsubl.s16 q11, d18, d16 \n"
    "vsubl.s16 q12, d19, d17 \n"
    "vmovl.u16 q13, d20 \n"
    "vmovl.u16 q10, d21 \n"
    "vmul.s32 q11, q11, q13 \n"
    "vmul.s32 q12, q12, q10 \n"
    "vrshrn.s32 d18, q11, #16 \n"
    "vrshrn.s32 d19, q12, #16 \n"
    "vadd.s16 q8, q8, q9 \n"
    "vmovn.s16 d6, q8 \n"

    MEMACCESS(0)
    "vst1.8 {d6}, [%0]! \n" // store pixels
    "vadd.s32 q1, q1, q0 \n"
    "vadd.s32 q2, q2, q0 \n"
    "subs %2, %2, #8 \n" // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
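
// A plain-C sketch (illustrative only; the name is hypothetical) of the
// general blend case above: each output pixel mixes the two source rows by
// source_y_fraction, with fractions 0, 64, 128 and 192 handled by cheaper
// special cases, and the pixel just past dst_width replicated from the last
// computed one.
//   void ScaleFilterRows_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
//                                 ptrdiff_t src_stride, int dst_width,
//                                 int source_y_fraction) {
//     int y1_fraction = source_y_fraction;
//     int y0_fraction = 256 - y1_fraction;
//     const uint8* src_ptr1 = src_ptr + src_stride;
//     int x;
//     for (x = 0; x < dst_width; ++x) {
//       dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
//                             src_ptr1[x] * y1_fraction + 128) >> 8);
//     }
//     dst_ptr[dst_width] = dst_ptr[dst_width - 1];
//   }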

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    "vrshrn.u16 d2, q2, #1 \n"
    "vrshrn.u16 d3, q3, #1 \n"
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n" // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
    "subs %3, %3, #4 \n" // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5, lsl #2 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld1.32 {"#dn"["#n"]}, [%6] \n"

void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  int tmp;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "1: \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32 {q0, q1}, [%0]! \n" // store pixels
    "subs %2, %2, #8 \n" // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "=&r"(tmp),       // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5, lsl #2 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "vdup.32 q0, %3 \n" // x
    "vdup.32 q1, %4 \n" // dx
    "vld1.32 {q2}, [%5] \n" // 0 1 2 3
    "vshl.i32 q9, q1, #2 \n" // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    "vmov.i8 q3, #0x7f \n" // 0x7F
    "vmov.i16 q15, #0x7f \n" // 0x7F
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q8, q1, q0 \n"
    "1: \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32 d22, q8, #9 \n"
    "vand.16 d22, d22, d30 \n"
    "vdup.8 d24, d22[0] \n"
    "vdup.8 d25, d22[2] \n"
    "vdup.8 d26, d22[4] \n"
    "vdup.8 d27, d22[6] \n"
    "vext.8 d4, d24, d25, #4 \n"
    "vext.8 d5, d26, d27, #4 \n" // f
    "veor.8 q10, q2, q3 \n" // 0x7f ^ f
    "vmull.u8 q11, d0, d20 \n"
    "vmull.u8 q12, d1, d21 \n"
    "vmull.u8 q13, d2, d4 \n"
    "vmull.u8 q14, d3, d5 \n"
    "vadd.i16 q11, q11, q13 \n"
    "vadd.i16 q12, q12, q14 \n"
    "vshrn.i16 d0, q11, #7 \n"
    "vshrn.i16 d1, q12, #7 \n"

    MEMACCESS(0)
    "vst1.32 {d0, d1}, [%0]! \n" // store pixels
    "vadd.s32 q8, q8, q9 \n"
    "subs %2, %2, #4 \n" // 4 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif