/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst),        // %1
    "+r"(dst_width)   // %2
  :
  : "q0", "q1"        // Clobber List
  );
}
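
// A minimal scalar sketch of what the loop above computes: point-sample by
// keeping the odd-indexed pixel of every pair (the asm stores q1). The
// function name and the #if 0 guard are illustrative only; this is not the
// C fallback that libyuv actually dispatches to.
#if 0
static void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair
  }
}
#endif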

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
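
// Scalar sketch of the 2x2 box filter above: each output byte is the rounded
// average of a 2x2 block spanning two source rows. Guarded out of the build;
// the name is hypothetical and this is not libyuv's C fallback.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds like vrshrn #2
    s += 2;
    t += 2;
  }
}
#endif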

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n"  // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_ptr1),   // %3
    "+r"(src_ptr2),   // %4
    "+r"(src_ptr3)    // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
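
// Scalar sketch of the 4x4 box filter above: sum a 4x4 block of source bytes
// and divide by 16 with rounding, as the vpaddl/vpadal chain followed by
// vrshrn #4 does. Guarded out of the build; the name is hypothetical.
#if 0
static void ScaleRowDown4Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int i, j;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // +8 rounds like vrshrn #4
  }
}
#endif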

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write to load
// every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n"  // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(src_stride)   // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
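
// Scalar sketch of the 3/4 box scaler above: the two source rows are first
// blended 3:1 in favor of row 0, then every 4 horizontal pixels are filtered
// down to 3 with 3:1, 1:1 and 1:3 taps. Guarded out of the build; the asm
// uses saturating rounding narrows (vqrshrn), which give the same results
// here because the sums cannot overflow 8 bits after the shift.
#if 0
static void ScaleRowDown34_0_Box_Sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    uint8 p0 = (uint8)((s[0] * 3 + t[0] + 2) >> 2);  // vertical 3:1 blend
    uint8 p1 = (uint8)((s[1] * 3 + t[1] + 2) >> 2);
    uint8 p2 = (uint8)((s[2] * 3 + t[2] + 2) >> 2);
    uint8 p3 = (uint8)((s[3] * 3 + t[3] + 2) >> 2);
    dst_ptr[x + 0] = (uint8)((p0 * 3 + p1 + 2) >> 2);  // a0
    dst_ptr[x + 1] = (uint8)((p1 + p2 + 1) >> 1);      // a1
    dst_ptr[x + 2] = (uint8)((p2 + p3 * 3 + 2) >> 2);  // a2
    s += 4;
    t += 4;
  }
}
#endif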

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(src_stride)   // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
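
// The Div6/Div9 constants feed vqrdmulh.s16, which returns roughly
// (a * b * 2) >> 16 with rounding, so dividing by 6 uses a multiplier of
// 65536 / 12 (half of 65536 / 6) and dividing by 9 uses 65536 / 18. A scalar
// sketch of the same fixed-point trick (guarded out of the build; the helper
// name is made up):
#if 0
static uint8 FixedPointDivideSketch(uint16 sum, int16 multiplier) {
  // Approximates vqrdmulh.s16 followed by vmovn.u16.
  return (uint8)(((int32)sum * multiplier * 2 + 32768) >> 16);
}
// Example: FixedPointDivideSketch(300, 65536 / 12) is 50, i.e. 300 / 6.
#endif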

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"(&kShuf38)     // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n"
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n"
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around so that adjacent
    // data can be added: 0,1 - 2,3 - 4,5 - 6,7.
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
392 "vqrdmulh.s16 q0, q0, q15 \n"
393
394 // Align for table lookup, vtbl requires registers to
395 // be adjacent
396 "vmov.u8 d2, d4 \n"
397
398 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
399 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
400
401 MEMACCESS(1)
402 "vst1.8 {d3}, [%1]! \n"
403 MEMACCESS(1)
404 "vst1.32 {d4[0]}, [%1]! \n"
405 "bgt 1b \n"
406 : "+r"(src_ptr), // %0
407 "+r"(dst_ptr), // %1
408 "+r"(dst_width), // %2
409 "+r"(src_stride), // %3
410 "+r"(src_ptr1) // %4
411 : "r"(&kMult38_Div6), // %5
412 "r"(&kShuf38_2), // %6
413 "r"(&kMult38_Div9) // %7
414 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
415 );
416 }

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around so that adjacent
    // data can be added: 0,1 - 2,3 - 4,5 - 6,7.
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
495 "vqrdmulh.s16 q0, q0, q13 \n"
496
497 // Align for table lookup, vtbl requires registers to
498 // be adjacent
499 "vmov.u8 d2, d4 \n"
500
501 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
502 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
503
504 MEMACCESS(1)
505 "vst1.8 {d3}, [%1]! \n"
506 MEMACCESS(1)
507 "vst1.32 {d4[0]}, [%1]! \n"
508 "bgt 1b \n"
509 : "+r"(src_ptr), // %0
510 "+r"(dst_ptr), // %1
511 "+r"(dst_width), // %2
512 "+r"(src_stride) // %3
513 : "r"(&kMult38_Div6), // %4
514 "r"(&kShuf38_2) // %5
515 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
516 );
517 }
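
// Scalar sketch of the two-row 3/8 box scaler above: every 8 source columns
// become 3 output pixels, built from column groups of width 3, 3 and 2, each
// averaged over two rows (divide by 6, 6 and 4). Guarded out of the build;
// the asm reaches the same result with transposes plus the fixed-point divide
// described next to kMult38_Div6.
#if 0
static void ScaleRowDown38_2_Box_Sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) / 6);
    dst_ptr[x + 1] = (uint8)((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) / 6);
    dst_ptr[x + 2] = (uint8)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
    s += 8;
    t += 8;
  }
}
#endif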

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+r"(src_stride),         // %2
    "+r"(dst_width),          // %3
    "+r"(source_y_fraction)   // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
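
// Scalar sketch of the general row blend above: each output pixel is a
// weighted average of the same column in two rows, with source_y_fraction
// out of 256 given to the second row; the 0/64/128/192 branches in the asm
// are just faster paths to the same result. Guarded out of the build, and
// the trailing vst1.8 {d1[7]} store of the last pixel is omitted here.
#if 0
static void ScaleFilterRows_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  int f = source_y_fraction;  // expected range 0..256
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((row0[x] * (256 - f) + row1[x] * f + 128) >> 8);
  }
}
#endif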

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst),        // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"(src_stepx)    // %3
  : "memory", "cc", "r12", "q0"
  );
}
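
// Scalar sketch of the even ARGB down-sampler above: copy one whole 4-byte
// pixel every src_stepx pixels (the asm precomputes the byte step as
// src_stepx * 4 in r12). Guarded out of the build; the name is hypothetical.
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8* src_argb, int src_stepx,
                                        uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);  // 4 byte aligned per above
  uint32* dst = (uint32*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];  // pick every src_stepx-th ARGB pixel
  }
}
#endif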

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif