/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__native_client__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig
// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
    "1:                                        \n"
    // load even pixels into q0, odd into q1
    "vld2.8     {q0, q1}, [%0]!                \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
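
// A scalar sketch of what the routine above computes, for reference only;
// the _Scalar name is ours, not libyuv API. vld2.8 de-interleaves even and
// odd bytes, and only the odd half (q1) is stored.
static void ScaleRowDown2_Scalar(const uint8* src_ptr, uint8* dst,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep odd pixels, matching vst1.8 {q1}
  }
}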

// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
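
// Scalar sketch of the 2x2 box filter above (illustrative only; not libyuv
// API): each output is the rounded average of a 2x2 block, matching the
// vpaddl/vpadal pairwise sums followed by vrshrn #2.
static void ScaleRowDown2Box_Scalar(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}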

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "vst1.8     {d2}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "add        r4, %0, %3                     \n"
    "add        r5, r4, %3                     \n"
    "add        %3, r5, %3                     \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld1.8     {q0}, [%0]!                    \n"  // load up 16x4
    "vld1.8     {q1}, [r4]!                    \n"
    "vld1.8     {q2}, [r5]!                    \n"
    "vld1.8     {q3}, [%3]!                    \n"
    "subs       %2, %2, #4                     \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"  // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
    "vst1.32    {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(src_stride)         // %3
  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
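
// Scalar sketch of the 4x4 box filter above (illustrative only; not libyuv
// API): sum a 4x4 block of 16 pixels and divide by 16 with rounding, which
// is what the vpaddl/vpadal chain plus vrshrn #4 produce.
static void ScaleRowDown4Box_Scalar(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int i, j;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}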

// Downscale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "subs       %2, %2, #24                    \n"
    "vmov       d2, d3                         \n"  // order d0, d1, d2
    "vst3.8     {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
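
// Scalar sketch of the point sampling above (illustrative only; not libyuv
// API): vld4.8 splits each group of 4 pixels across d0..d3, and the vmov
// before vst3.8 means pixels 0, 1 and 3 of every group of 4 are kept.
static void ScaleRowDown34_Scalar(const uint8* src_ptr, uint8* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}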

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, %2, #24                    \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8   q8, d4                         \n"
    "vmovl.u8   q9, d5                         \n"
    "vmovl.u8   q10, d6                        \n"
    "vmovl.u8   q11, d7                        \n"

    // 3 * line_0 + line_1
    "vmlal.u8   q8, d0, d24                    \n"
    "vmlal.u8   q9, d1, d24                    \n"
    "vmlal.u8   q10, d2, d24                   \n"
    "vmlal.u8   q11, d3, d24                   \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2                    \n"
    "vqrshrn.u16 d1, q9, #2                    \n"
    "vqrshrn.u16 d2, q10, #2                   \n"
    "vqrshrn.u16 d3, q11, #2                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8   q8, d1                         \n"
    "vmlal.u8   q8, d0, d24                    \n"
    "vqrshrn.u16 d0, q8, #2                    \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8   q8, d2                         \n"
    "vmlal.u8   q8, d3, d24                    \n"
    "vqrshrn.u16 d2, q8, #2                    \n"

    "vst3.8     {d0, d1, d2}, [%1]!            \n"

    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
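
// Scalar sketch of the filtering above (illustrative only; not libyuv API):
// blend the two rows with weights 3:1, then map each 4 blended pixels to 3
// outputs with (3,1)/4, (1,1)/2 and (1,3)/4 taps. All shifts round, matching
// vqrshrn and vrhadd.
static void ScaleRowDown34_0_Box_Scalar(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    int p0 = (s[0] * 3 + t[0] + 2) >> 2;
    int p1 = (s[1] * 3 + t[1] + 2) >> 2;
    int p2 = (s[2] * 3 + t[2] + 2) >> 2;
    int p3 = (s[3] * 3 + t[3] + 2) >> 2;
    dst_ptr[x + 0] = (uint8)((p0 * 3 + p1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((p1 + p2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8)((p2 + p3 * 3 + 2) >> 2);
    s += 4;
    t += 4;
  }
}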

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, %2, #24                    \n"
    // average src line 0 with src line 1
    "vrhadd.u8  q0, q0, q2                     \n"
    "vrhadd.u8  q1, q1, q3                     \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8   q3, d1                         \n"
    "vmlal.u8   q3, d0, d24                    \n"
    "vqrshrn.u16 d0, q3, #2                    \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8   q3, d2                         \n"
    "vmlal.u8   q3, d3, d24                    \n"
    "vqrshrn.u16 d2, q3, #2                    \n"

    "vst3.8     {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
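
// Illustrative sketch (not library code) of the fixed-point division these
// constants implement. vqrdmulh.s16 computes roughly (2 * a * b + 32768)
// >> 16 per signed 16-bit lane, with saturation, so a multiplier of
// 65536 / 12 divides by 6 and 65536 / 18 divides by 9. For example, a sum
// of 1200 (six pixels of value 200) maps to
// (2 * 1200 * 5461 + 32768) >> 16 = 200 = 1200 / 6.
static int ScaleDivideExample(int sum, int divisor) {
  int multiplier = 65536 / (2 * divisor);  // 5461 for /6, 3640 for /9
  return (2 * sum * multiplier + 32768) >> 16;
}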

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.8     {q3}, [%3]                     \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, %2, #12                    \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    "vst1.8     {d4}, [%1]!                    \n"
    "vst1.32    {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
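
// Scalar sketch of the point sampling above (illustrative only; not libyuv
// API): the vtbl lookups pick 12 of every 32 pixels using the indices
// stored in kShuf38.
static void ScaleRowDown38_Scalar(const uint8* src_ptr, uint8* dst_ptr,
                                  int dst_width) {
  static const int kIdx[12] = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30};
  int x, i;
  for (x = 0; x < dst_width; x += 12) {
    for (i = 0; i < 12; ++i) {
      dst_ptr[x + i] = src_ptr[kIdx[i]];
    }
    src_ptr += 32;
  }
}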

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16    {q13}, [%4]                    \n"
    "vld1.8     {q14}, [%5]                    \n"
    "vld1.8     {q15}, [%6]                    \n"
    "add        r4, %0, %3, lsl #1             \n"
    "add        %3, %0                         \n"
    ".p2align   2                              \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"
    "vld4.8     {d16, d17, d18, d19}, [r4]!    \n"
    "subs       %2, %2, #12                    \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"
    "vtrn.u8    d16, d17                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"
    "vtrn.u8    d18, d19                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"
    "vpaddl.u8  q8, q8                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"
    "vpaddl.u8  d19, d19                       \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   q0, q8                         \n"
    "vadd.u16   d4, d3, d7                     \n"
    "vadd.u16   d4, d19                        \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //            + s[6 + st * 1] + s[7 + st * 1]
    //            + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16  d4, q2                         \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"
    "vmovl.u8   q9, d18                        \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"
    "vadd.u16   q1, q9                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.8     {d3}, [%1]!                    \n"
    "vst1.32    {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2),        // %5
    "r"(&kMult38_Div9)      // %6
  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
    "q13", "q14", "q15", "memory", "cc"
  );
}
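
// Scalar sketch of the 3-row box above (illustrative only; not libyuv API,
// and it truncates where vqrdmulh rounds): each group of 8 source columns
// yields 3 outputs, the first two averaging 3x3 blocks (divide by 9) and
// the third averaging a 2x3 block (divide by 6).
static void ScaleRowDown38_3_Box_Scalar(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  int x, r;
  for (x = 0; x < dst_width; x += 3) {
    int s0 = 0, s1 = 0, s2 = 0;
    for (r = 0; r < 3; ++r) {
      const uint8* s = src_ptr + r * src_stride;
      s0 += s[0] + s[1] + s[2];
      s1 += s[3] + s[4] + s[5];
      s2 += s[6] + s[7];
    }
    dst_ptr[x + 0] = (uint8)(s0 / 9);
    dst_ptr[x + 1] = (uint8)(s1 / 9);
    dst_ptr[x + 2] = (uint8)(s2 / 6);
    src_ptr += 8;
  }
}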

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16    {q13}, [%4]                    \n"
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
    ".p2align   2                              \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"
    "subs       %2, %2, #12                    \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   d4, d3, d7                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2                    \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.8     {d3}, [%1]!                    \n"
    "vst1.32    {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2)         // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    "add        %2, %1                         \n"
    "cmp        %4, #64                        \n"
    "beq        75f                            \n"
    "cmp        %4, #128                       \n"
    "beq        50f                            \n"
    "cmp        %4, #192                       \n"
    "beq        25f                            \n"

    "vdup.8     d5, %4                         \n"
    "rsb        %4, #256                       \n"
    "vdup.8     d4, %4                         \n"
    // General purpose row blend.
    "1:                                        \n"
    "vld1.8     {q0}, [%1]!                    \n"
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vmull.u8   q13, d0, d4                    \n"
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
    "vrshrn.u16 d0, q13, #8                    \n"
    "vrshrn.u16 d1, q14, #8                    \n"
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        1b                             \n"
    "b          99f                            \n"

    // Blend 25 / 75.
    "25:                                       \n"
    "vld1.8     {q0}, [%1]!                    \n"
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        25b                            \n"
    "b          99f                            \n"

    // Blend 50 / 50.
    "50:                                       \n"
    "vld1.8     {q0}, [%1]!                    \n"
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        50b                            \n"
    "b          99f                            \n"

    // Blend 75 / 25.
    "75:                                       \n"
    "vld1.8     {q1}, [%1]!                    \n"
    "vld1.8     {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        75b                            \n"
    "b          99f                            \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100:                                      \n"
    "vld1.8     {q0}, [%1]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        100b                           \n"

    "99:                                       \n"
    "vst1.8     {d1[7]}, [%0]                  \n"  // duplicate the last pixel
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
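
// Scalar sketch of the general blend path above (illustrative only; not
// libyuv API). source_y_fraction weights the second row and 256 - fraction
// the first; the 0/64/128/192 special cases in the asm are just faster
// vrhadd forms of the same formula. The trailing write mirrors the asm's
// final vst1.8 {d1[7]}, which duplicates the last output pixel.
static void ScaleFilterRows_Scalar(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
  const uint8* s0 = src_ptr;
  const uint8* s1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s0[x] * y0 + s1[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate the last pixel
}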

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
    "1:                                        \n"
    // load even pixels into q0, odd into q1
    "vld2.32    {q0, q1}, [%0]!                \n"
    "vld2.32    {q2, q3}, [%0]!                \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
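
// Scalar sketch of the ARGB box filter above (illustrative only; not libyuv
// API): average 2x2 blocks of ARGB pixels per channel with rounding, the
// planar ScaleRowDown2Box applied to 4 interleaved channels.
static void ScaleARGBRowDown2Box_Scalar(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {   // B, G, R, A
      int i = x * 8 + c;        // two 4-byte source pixels per output
      dst[x * 4 + c] = (uint8)((src_ptr[i] + src_ptr[i + 4] +
                                src_ptr[i + src_stride] +
                                src_ptr[i + src_stride + 4] + 2) >> 2);
    }
  }
}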

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld1.32    {d0[0]}, [%0], r12             \n"
    "vld1.32    {d0[1]}, [%0], r12             \n"
    "vld1.32    {d1[0]}, [%0], r12             \n"
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  : "r"(src_stepx)          // %3
  : "memory", "cc", "r12", "q0"
  );
}
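
// Scalar sketch of the strided lane loads above (illustrative only; not
// libyuv API): copy one 4-byte ARGB pixel every src_stepx pixels.
static void ScaleARGBRowDownEven_Scalar(const uint8* src_argb, int src_stepx,
                                        uint8* dst_argb, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8* s = src_argb + x * src_stepx * 4;
    dst_argb[x * 4 + 0] = s[0];
    dst_argb[x * 4 + 1] = s[1];
    dst_argb[x * 4 + 2] = s[2];
    dst_argb[x * 4 + 3] = s[3];
  }
}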

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
    ".p2align   2                              \n"
    "1:                                        \n"
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
    "vld1.8     {d1}, [%1], r12                \n"
    "vld1.8     {d2}, [%0], r12                \n"
    "vld1.8     {d3}, [%1], r12                \n"
    "vld1.8     {d4}, [%0], r12                \n"
    "vld1.8     {d5}, [%1], r12                \n"
    "vld1.8     {d6}, [%0], r12                \n"
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"
    "vaddl.u8   q1, d2, d3                     \n"
    "vaddl.u8   q2, d4, d5                     \n"
    "vaddl.u8   q3, d6, d7                     \n"
    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),         // %0
    "+r"(src_stride),       // %1
    "+r"(dst_argb),         // %2
    "+r"(dst_width)         // %3
  : "r"(src_stepx)          // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif