/*
 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v1.16b}, [%1], #16 \n"  // store odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
  );
}

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
  );
}
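
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of what the loop above computes: each output pixel is the
// rounding half add of an even/odd source pair, i.e. the urhadd of the two
// de-interleaved ld2 lanes.
#if 0
static void ScaleRowDown2Linear_Sketch(const uint8_t* src_ptr,
                                       uint8_t* dst,
                                       int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src_ptr[0] + src_ptr[1] + 1) >> 1);  // round to nearest
    src_ptr += 2;
  }
}
#endif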

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // load row 1 and post inc
      "ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // load row 2 and post inc
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "uaddlp v0.8h, v0.16b \n"  // row 1 add adjacent
      "uaddlp v1.8h, v1.16b \n"
      "uadalp v0.8h, v2.16b \n"  // += row 2 add adjacent
      "uadalp v1.8h, v3.16b \n"
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn2 v0.16b, v1.8h, #2 \n"
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "v0", "v1", "v2", "v3"  // Clobber List
  );
}
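
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the 2x2 box filter above: uaddlp/uadalp sum each 2x2 block
// into 16 bits and rshrn #2 divides by 4 with rounding.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}
#endif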

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "st1 {v2.8b}, [%1], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load up 16x4
      "ld1 {v1.16b}, [%2], #16 \n"
      "ld1 {v2.16b}, [%3], #16 \n"
      "ld1 {v3.16b}, [%4], #16 \n"
      "subs %w5, %w5, #4 \n"
      "uaddlp v0.8h, v0.16b \n"
      "uadalp v0.8h, v1.16b \n"
      "uadalp v0.8h, v2.16b \n"
      "uadalp v0.8h, v3.16b \n"
      "addp v0.8h, v0.8h, v0.8h \n"
      "rshrn v0.8b, v0.8h, #4 \n"  // divide by 16 w/rounding
      "st1 {v0.s}[0], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
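
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the 4x4 box filter above: sum 16 source pixels per output
// pixel, then divide by 16 with rounding (the rshrn #4 step).
#if 0
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      const uint8_t* s = src_ptr + row * src_stride + x * 4;
      sum += s[0] + s[1] + s[2] + s[3];
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}
#endif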

// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #24 \n"
      "orr v2.16b, v3.16b, v3.16b \n"  // order v0,v1,v2
      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll v16.8h, v4.8b, #0 \n"
      "ushll v17.8h, v5.8b, #0 \n"
      "ushll v18.8h, v6.8b, #0 \n"
      "ushll v19.8h, v7.8b, #0 \n"

      // 3 * line_0 + line_1
      "umlal v16.8h, v0.8b, v20.8b \n"
      "umlal v17.8h, v1.8b, v20.8b \n"
      "umlal v18.8h, v2.8b, v20.8b \n"
      "umlal v19.8h, v3.8b, v20.8b \n"

      // (3 * line_0 + line_1) >> 2
      "uqrshrn v0.8b, v16.8h, #2 \n"
      "uqrshrn v1.8b, v17.8h, #2 \n"
      "uqrshrn v2.8b, v18.8h, #2 \n"
      "uqrshrn v3.8b, v19.8h, #2 \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v16.8h, v1.8b, #0 \n"
      "umlal v16.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v16.8h, #2 \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v16.8h, v2.8b, #0 \n"
      "umlal v16.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v16.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}
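
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the 4->3 box filter above: rows are first blended 3:1,
// then each group of 4 filtered pixels p0..p3 is reduced to 3 outputs with
// 3:1, 1:1 and 1:3 weights, all with rounding.
#if 0
static void ScaleRowDown34_0_Box_Sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    uint8_t p[4];
    for (int i = 0; i < 4; ++i) {
      p[i] = (uint8_t)((3 * s0[i] + s1[i] + 2) >> 2);  // vertical 3:1 blend
    }
    dst_ptr[x + 0] = (uint8_t)((3 * p[0] + p[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((p[1] + p[2] + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((p[2] + 3 * p[3] + 2) >> 2);
    s0 += 4;
    s1 += 4;
  }
}
#endif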

void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"
      // average src line 0 with src line 1
      "urhadd v0.8b, v0.8b, v4.8b \n"
      "urhadd v1.8b, v1.8b, v5.8b \n"
      "urhadd v2.8b, v2.8b, v6.8b \n"
      "urhadd v3.8b, v3.8b, v7.8b \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v4.8h, v1.8b, #0 \n"
      "umlal v4.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v4.8h, #2 \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v4.8h, v2.8b, #0 \n"
      "umlal v4.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v4.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}

static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
                              22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
                                34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};

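// The Div6/Div9 constants are used with sqrdmulh, which computes
// round((a * b * 2) >> 16). With b = 65536 / 12 that is approximately a / 6,
// and with b = 65536 / 18 approximately a / 9. Illustrative sketch (not part
// of the build; the function name is hypothetical) of the equivalent scalar
// arithmetic, ignoring the saturation step:
#if 0
static int16_t SqRdMulH_Sketch(int16_t a, int16_t b) {
  return (int16_t)((2 * a * b + 32768) >> 16);  // e.g. a = 30, b = 5461 -> 5
}
#endif
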
// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "ld1 {v3.16b}, [%3] \n"
      "1: \n"
      "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #12 \n"
      "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
      "st1 {v2.8b}, [%1], #8 \n"
      "st1 {v2.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile(
      "ld1 {v29.8h}, [%5] \n"
      "ld1 {v30.16b}, [%6] \n"
      "ld1 {v31.8h}, [%7] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
      "subs %w4, %w4, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v20.8b, v0.8b, v1.8b \n"
      "trn2 v21.8b, v0.8b, v1.8b \n"
      "trn1 v22.8b, v4.8b, v5.8b \n"
      "trn2 v23.8b, v4.8b, v5.8b \n"
      "trn1 v24.8b, v16.8b, v17.8b \n"
      "trn2 v25.8b, v16.8b, v17.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"
      "trn1 v16.8b, v18.8b, v19.8b \n"
      "trn2 v17.8b, v18.8b, v19.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v20.4h, v20.8b \n"
      "uaddlp v21.4h, v21.8b \n"
      "uaddlp v22.4h, v22.8b \n"
      "uaddlp v23.4h, v23.8b \n"
      "uaddlp v24.4h, v24.8b \n"
      "uaddlp v25.4h, v25.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"
      "uaddlp v17.4h, v17.8b \n"

      // combine source lines
      "add v20.4h, v20.4h, v22.4h \n"
      "add v21.4h, v21.4h, v23.4h \n"
      "add v20.4h, v20.4h, v24.4h \n"
      "add v21.4h, v21.4h, v25.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"
      "add v2.4h, v2.4h, v17.4h \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "sqrdmulh v2.8h, v2.8h, v29.8h \n"
      "xtn v2.8b, v2.8h \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "ushll v16.8h, v16.8b, #0 \n"
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // combine source lines
      "add v0.8h, v0.8h, v16.8h \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"

      // 0+1+2, 3+4+5
      "add v20.8h, v20.8h, v0.8h \n"
      "add v21.8h, v21.8h, v4.8h \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v20.8h, v31.8h \n"
      "sqrdmulh v1.8h, v21.8h, v31.8h \n"

      // Align for table lookup, vtbl requires registers to be adjacent
      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(src_ptr1),        // %3
        "+r"(dst_width)        // %4
      : "r"(&kMult38_Div6),    // %5
        "r"(&kShuf38_2),       // %6
        "r"(&kMult38_Div9)     // %7
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
        "memory", "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile(
      "ld1 {v30.8h}, [%4] \n"
      "ld1 {v31.16b}, [%5] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "subs %w3, %w3, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v16.8b, v0.8b, v1.8b \n"
      "trn2 v17.8b, v0.8b, v1.8b \n"
      "trn1 v18.8b, v4.8b, v5.8b \n"
      "trn2 v19.8b, v4.8b, v5.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v16.4h, v16.8b \n"
      "uaddlp v17.4h, v17.8b \n"
      "uaddlp v18.4h, v18.8b \n"
      "uaddlp v19.4h, v19.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"

      // combine source lines
      "add v16.4h, v16.4h, v18.4h \n"
      "add v17.4h, v17.4h, v19.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "uqrshrn v2.8b, v2.8h, #2 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

      // combine source lines
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"

      // 0+1+2, 3+4+5
      "add v16.8h, v16.8h, v0.8h \n"
      "add v17.8h, v17.8h, v4.8h \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v16.8h, v30.8h \n"
      "sqrdmulh v1.8h, v17.8h, v30.8h \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent

      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(dst_width)        // %3
      : "r"(&kMult38_Div6),    // %4
        "r"(&kShuf38_2)        // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v30", "v31", "memory", "cc");
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.8h, v2.8h}, [%1] \n"  // load accumulator
      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 bytes
      "uaddw2 v2.8h, v2.8h, v0.16b \n"  // add
      "uaddw v1.8h, v1.8h, v0.8b \n"
      "st1 {v1.8h, v2.8h}, [%1], #32 \n"  // store accumulator
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
  );
}
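
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the accumulation above: widen each source byte and add it
// into the 16-bit accumulator row in place.
#if 0
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}
#endif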

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)              \
  "lsr %5, %3, #16 \n"                   \
  "add %6, %1, %5 \n"                    \
  "add %3, %3, %4 \n"                    \
  "ld2 {v4.b, v5.b}[" #n "], [%6] \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile(
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v3.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v1.4s, v1.4s, v0.4s \n"
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "add v2.4s, v1.4s, v3.4s \n"
      "shl v0.4s, v3.4s, #1 \n"  // 8 * dx
      "1: \n"
      LOAD2_DATA8_LANE(0)
      LOAD2_DATA8_LANE(1)
      LOAD2_DATA8_LANE(2)
      LOAD2_DATA8_LANE(3)
      LOAD2_DATA8_LANE(4)
      LOAD2_DATA8_LANE(5)
      LOAD2_DATA8_LANE(6)
      LOAD2_DATA8_LANE(7)
      "mov v6.16b, v1.16b \n"
      "mov v7.16b, v2.16b \n"
      "uzp1 v6.8h, v6.8h, v7.8h \n"
      "ushll v4.8h, v4.8b, #0 \n"
      "ushll v5.8h, v5.8b, #0 \n"
      "ssubl v16.4s, v5.4h, v4.4h \n"
      "ssubl2 v17.4s, v5.8h, v4.8h \n"
      "ushll v7.4s, v6.4h, #0 \n"
      "ushll2 v6.4s, v6.8h, #0 \n"
      "mul v16.4s, v16.4s, v7.4s \n"
      "mul v17.4s, v17.4s, v6.4s \n"
      "rshrn v6.4h, v16.4s, #16 \n"
      "rshrn2 v6.8h, v17.4s, #16 \n"
      "add v4.8h, v4.8h, v6.8h \n"
      "xtn v4.8b, v4.8h \n"

      "st1 {v4.8b}, [%0], #8 \n"  // store pixels
      "add v1.4s, v1.4s, v0.4s \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_ptr),    // %0
        "+r"(src_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3",
        "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
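
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the column filtering above: x is a 16.16 fixed-point
// source position and the fraction blends the two neighbouring source pixels
// using the BLENDER formula quoted before the function.
#if 0
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;  // integer source index
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    int f = x & 0xffff;  // 16-bit fraction
    dst_ptr[j] = (uint8_t)(a + (((f * (b - a)) + 0x8000) >> 16));
    x += dx;
  }
}
#endif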

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile(
      "cmp %w4, #0 \n"
      "b.eq 100f \n"
      "add %2, %2, %1 \n"
      "cmp %w4, #64 \n"
      "b.eq 75f \n"
      "cmp %w4, #128 \n"
      "b.eq 50f \n"
      "cmp %w4, #192 \n"
      "b.eq 25f \n"

      "dup v5.8b, %w4 \n"
      "dup v4.8b, %w5 \n"
      // General purpose row blend.
      "1: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "umull v6.8h, v0.8b, v4.8b \n"
      "umull2 v7.8h, v0.16b, v4.16b \n"
      "umlal v6.8h, v1.8b, v5.8b \n"
      "umlal2 v7.8h, v1.16b, v5.16b \n"
      "rshrn v0.8b, v6.8h, #8 \n"
      "rshrn2 v0.16b, v7.8h, #8 \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 1b \n"
      "b 99f \n"

      // Blend 25 / 75.
      "25: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 25b \n"
      "b 99f \n"

      // Blend 50 / 50.
      "50: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 50b \n"
      "b 99f \n"

      // Blend 75 / 25.
      "75: \n"
      "ld1 {v1.16b}, [%1], #16 \n"
      "ld1 {v0.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 75b \n"
      "b 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "subs %w3, %w3, #16 \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 100b \n"

      "99: \n"
      "st1 {v0.b}[15], [%0] \n"
      : "+r"(dst_ptr),            // %0
        "+r"(src_ptr),            // %1
        "+r"(src_stride),         // %2
        "+r"(dst_width),          // %3
        "+r"(source_y_fraction),  // %4
        "+r"(y_fraction)          // %5
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}
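
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the general-purpose row blend above: each output pixel is
// a weighted average of the two source rows, with source_y_fraction/256 taken
// from the second row and the remainder from the first, rounded.
#if 0
static void ScaleFilterRows_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   int dst_width,
                                   int source_y_fraction) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  int f1 = source_y_fraction;  // weight of row 1
  int f0 = 256 - f1;           // weight of row 0
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s0[x] * f0 + s1[x] * f1 + 128) >> 8);
  }
}
#endif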

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "mov v2.16b, v3.16b \n"
      "st2 {v1.4s,v2.4s}, [%1], #32 \n"  // store 8 odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop

      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "urhadd v1.16b, v2.16b, v3.16b \n"
      "st2 {v0.4s,v1.4s}, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
      "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
      "uaddlp v3.8h, v3.16b \n"  // A 16 bytes -> 8 shorts.
      "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load next 16
      "uadalp v0.8h, v16.16b \n"  // B 16 bytes -> 8 shorts.
      "uadalp v1.8h, v17.16b \n"  // G 16 bytes -> 8 shorts.
      "uadalp v2.8h, v18.16b \n"  // R 16 bytes -> 8 shorts.
      "uadalp v3.8h, v19.16b \n"  // A 16 bytes -> 8 shorts.
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn v1.8b, v1.8h, #2 \n"
      "rshrn v2.8b, v2.8h, #2 \n"
      "rshrn v3.8b, v3.8h, #2 \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld1 {v0.s}[0], [%0], %3 \n"
      "ld1 {v0.s}[1], [%0], %3 \n"
      "ld1 {v0.s}[2], [%0], %3 \n"
      "ld1 {v0.s}[3], [%0], %3 \n"
      "subs %w2, %w2, #4 \n"  // 4 pixels per loop.
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}
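
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the point sampling above: copy one 4-byte ARGB pixel, then
// step forward by src_stepx pixels.
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb,
                                        int src_stepx,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_argb[x * 4 + 0] = src_argb[0];
    dst_argb[x * 4 + 1] = src_argb[1];
    dst_argb[x * 4 + 2] = src_argb[2];
    dst_argb[x * 4 + 3] = src_argb[3];
    src_argb += src_stepx * 4;
  }
}
#endif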

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.8b}, [%0], %4 \n"  // Read 4 2x2 -> 2x1
      "ld1 {v1.8b}, [%1], %4 \n"
      "ld1 {v2.8b}, [%0], %4 \n"
      "ld1 {v3.8b}, [%1], %4 \n"
      "ld1 {v4.8b}, [%0], %4 \n"
      "ld1 {v5.8b}, [%1], %4 \n"
      "ld1 {v6.8b}, [%0], %4 \n"
      "ld1 {v7.8b}, [%1], %4 \n"
      "uaddl v0.8h, v0.8b, v1.8b \n"
      "uaddl v2.8h, v2.8b, v3.8b \n"
      "uaddl v4.8h, v4.8b, v5.8b \n"
      "uaddl v6.8h, v6.8b, v7.8b \n"
      "mov v16.d[1], v0.d[1] \n"  // ab_cd -> ac_bd
      "mov v0.d[1], v2.d[0] \n"
      "mov v2.d[0], v16.d[1] \n"
      "mov v16.d[1], v4.d[1] \n"  // ef_gh -> eg_fh
      "mov v4.d[1], v6.d[0] \n"
      "mov v6.d[0], v16.d[1] \n"
      "add v0.8h, v0.8h, v2.8h \n"  // (a+b)_(c+d)
      "add v4.8h, v4.8h, v6.8h \n"  // (e+f)_(g+h)
      "rshrn v0.8b, v0.8h, #2 \n"  // first 2 pixels.
      "rshrn2 v0.16b, v4.8h, #2 \n"  // next 2 pixels.
      "subs %w3, %w3, #4 \n"  // 4 pixels per loop.
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(src_stride),  // %1
        "+r"(dst_argb),    // %2
        "+r"(dst_width)    // %3
      : "r"((int64_t)(src_stepx * 4))  // %4
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n)         \
  "lsr %5, %3, #16 \n"                   \
  "add %6, %1, %5, lsl #2 \n"            \
  "add %3, %3, %4 \n"                    \
  "ld1 {" #vn ".s}[" #n "], [%6] \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;
  asm volatile(
      "1: \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      // clang-format on
      "st1 {v0.4s, v1.4s}, [%0], #32 \n"  // store pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}

#undef LOAD1_DATA32_LANE
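
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the 16.16 fixed-point x/dx stepping the LOAD1_DATA32_LANE
// macro performs: each output ARGB pixel is point sampled from
// src_argb[(x >> 16) * 4].
#if 0
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // whole-pixel index from the 16.16 position
    x += dx;
  }
}
#endif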

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n)              \
  "lsr %5, %3, #16 \n"                              \
  "add %6, %1, %5, lsl #2 \n"                       \
  "add %3, %3, %4 \n"                               \
  "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile(
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v6.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      "movi v3.16b, #0x7f \n"  // 0x7F
      "movi v4.8h, #0x7f \n"  // 0x7F
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v5.4s, v1.4s, v0.4s \n"
      "1: \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(v0, v1, 0)
      LOAD2_DATA32_LANE(v0, v1, 1)
      LOAD2_DATA32_LANE(v0, v1, 2)
      LOAD2_DATA32_LANE(v0, v1, 3)
      "shrn v2.4h, v5.4s, #9 \n"
      "and v2.8b, v2.8b, v4.8b \n"
      "dup v16.8b, v2.b[0] \n"
      "dup v17.8b, v2.b[2] \n"
      "dup v18.8b, v2.b[4] \n"
      "dup v19.8b, v2.b[6] \n"
      "ext v2.8b, v16.8b, v17.8b, #4 \n"
      "ext v17.8b, v18.8b, v19.8b, #4 \n"
      "ins v2.d[1], v17.d[0] \n"  // f
      "eor v7.16b, v2.16b, v3.16b \n"  // 0x7f ^ f
      "umull v16.8h, v0.8b, v7.8b \n"
      "umull2 v17.8h, v0.16b, v7.16b \n"
      "umull v18.8h, v1.8b, v2.8b \n"
      "umull2 v19.8h, v1.16b, v2.16b \n"
      "add v16.8h, v16.8h, v18.8h \n"
      "add v17.8h, v17.8h, v19.8h \n"
      "shrn v0.8b, v16.8h, #7 \n"
      "shrn2 v0.16b, v17.8h, #7 \n"

      "st1 {v0.4s}, [%0], #16 \n"  // store pixels
      "add v5.4s, v5.4s, v6.4s \n"
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
        "v6", "v7", "v16", "v17", "v18", "v19"
  );
}

#undef LOAD2_DATA32_LANE
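
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the filtering above: a 7-bit fraction f is taken from bits
// 9..15 of the 16.16 position, and each channel of the two neighbouring ARGB
// pixels is blended as (a * (127 - f) + b * f) >> 7.
#if 0
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  for (int j = 0; j < dst_width; ++j) {
    const uint8_t* a = src_argb + (x >> 16) * 4;  // left pixel
    const uint8_t* b = a + 4;                     // right pixel
    int f = (x >> 9) & 0x7f;                      // 7-bit fraction
    for (int c = 0; c < 4; ++c) {
      dst_argb[j * 4 + c] = (uint8_t)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}
#endif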

// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
      "1: \n"
      "ld1 {v0.8h, v1.8h}, [%0], #32 \n"  // load row 1 and post inc
      "ld1 {v2.8h, v3.8h}, [%1], #32 \n"  // load row 2 and post inc
      "subs %w3, %w3, #8 \n"  // 8 processed per loop
      "uaddlp v0.4s, v0.8h \n"  // row 1 add adjacent
      "uaddlp v1.4s, v1.8h \n"
      "uadalp v0.4s, v2.8h \n"  // +row 2 add adjacent
      "uadalp v1.4s, v3.8h \n"
      "rshrn v0.4h, v0.4s, #2 \n"  // round and pack
      "rshrn2 v0.8h, v1.4s, #2 \n"
      "st1 {v0.8h}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Read 8x2 upsample with filtering and write 16x1.
// Actually reads an extra pixel, so 9x2.
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
      "movi v0.8h, #9 \n"  // constants
      "movi v1.4s, #3 \n"

      "1: \n"
      "ld1 {v3.8h}, [%0], %4 \n"  // TL read first 8
      "ld1 {v4.8h}, [%0], %5 \n"  // TR read 8 offset by 1
      "ld1 {v5.8h}, [%1], %4 \n"  // BL read 8 from next row
      "ld1 {v6.8h}, [%1], %5 \n"  // BR offset by 1
      "subs %w3, %w3, #16 \n"  // 16 dst pixels per loop
      "umull v16.4s, v3.4h, v0.4h \n"
      "umull2 v7.4s, v3.8h, v0.8h \n"
      "umull v18.4s, v4.4h, v0.4h \n"
      "umull2 v17.4s, v4.8h, v0.8h \n"
      "uaddw v16.4s, v16.4s, v6.4h \n"
      "uaddl2 v19.4s, v6.8h, v3.8h \n"
      "uaddl v3.4s, v6.4h, v3.4h \n"
      "uaddw2 v6.4s, v7.4s, v6.8h \n"
      "uaddl2 v7.4s, v5.8h, v4.8h \n"
      "uaddl v4.4s, v5.4h, v4.4h \n"
      "uaddw v18.4s, v18.4s, v5.4h \n"
      "mla v16.4s, v4.4s, v1.4s \n"
      "mla v18.4s, v3.4s, v1.4s \n"
      "mla v6.4s, v7.4s, v1.4s \n"
      "uaddw2 v4.4s, v17.4s, v5.8h \n"
      "uqrshrn v16.4h, v16.4s, #4 \n"
      "mla v4.4s, v19.4s, v1.4s \n"
      "uqrshrn2 v16.8h, v6.4s, #4 \n"
      "uqrshrn v17.4h, v18.4s, #4 \n"
      "uqrshrn2 v17.8h, v4.4s, #4 \n"
      "st2 {v16.8h-v17.8h}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      : "r"(2LL),          // %4
        "r"(14LL)          // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19"  // Clobber List
  );
}
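
// Illustrative scalar sketch (not part of the build; the function name is
// hypothetical) of the 2x bilinear upsample above: each output pixel is a
// 9:3:3:1 weighted average of the nearest source pixel and its horizontal,
// vertical and diagonal neighbours, divided by 16 with rounding.
#if 0
static void ScaleRowUp2_16_Sketch(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst,
                                  int dst_width) {
  const uint16_t* s0 = src_ptr;               // top row
  const uint16_t* s1 = src_ptr + src_stride;  // bottom row
  for (int x = 0; x < dst_width; ++x) {
    int i = x >> 1;  // source column
    int tl = s0[i], tr = s0[i + 1], bl = s1[i], br = s1[i + 1];
    // Even outputs lean on column i, odd outputs on column i + 1.
    int v = (x & 1) ? (9 * tr + 3 * tl + 3 * br + bl)
                    : (9 * tl + 3 * tr + 3 * bl + br);
    dst[x] = (uint16_t)((v + 8) >> 4);
  }
}
#endif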

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif