/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st1 {v1.16b}, [%1], #16 \n"  // store odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
      );
}

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
      );
}

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // load row 1 and post inc
      "ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // load row 2 and post inc
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "uaddlp v0.8h, v0.16b \n"  // row 1 add adjacent
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "uaddlp v1.8h, v1.16b \n"
      "prfm pldl1keep, [%1, 448] \n"
      "uadalp v0.8h, v2.16b \n"  // += row 2 add adjacent
      "uadalp v1.8h, v3.16b \n"
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn2 v0.16b, v1.8h, #2 \n"
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "v0", "v1", "v2", "v3"  // Clobber List
      );
}

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st1 {v2.8b}, [%1], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load up 16x4
      "ld1 {v1.16b}, [%2], #16 \n"
      "ld1 {v2.16b}, [%3], #16 \n"
      "ld1 {v3.16b}, [%4], #16 \n"
      "subs %w5, %w5, #4 \n"
      "uaddlp v0.8h, v0.16b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "uadalp v0.8h, v1.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "uadalp v0.8h, v2.16b \n"
      "prfm pldl1keep, [%3, 448] \n"
      "uadalp v0.8h, v3.16b \n"
      "prfm pldl1keep, [%4, 448] \n"
      "addp v0.8h, v0.8h, v0.8h \n"
      "rshrn v0.8b, v0.8h, #4 \n"  // divide by 16 w/rounding
      "st1 {v0.s}[0], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
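// (Point sampling keeps pixels 0, 1 and 3 of each group of 4; v3 is copied
// into v2 below so a single st3 writes the three kept pixels contiguously.)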
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #24 \n"
      "orr v2.16b, v3.16b, v3.16b \n"  // order v0,v1,v2
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

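// Down scale from 4 to 3 pixels with a box filter. The two source rows are
// blended 3:1 (rounded (3 * row0 + row1) >> 2) before the columns are
// filtered down from 4 to 3.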
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll v16.8h, v4.8b, #0 \n"
      "ushll v17.8h, v5.8b, #0 \n"
      "ushll v18.8h, v6.8b, #0 \n"
      "ushll v19.8h, v7.8b, #0 \n"

      // 3 * line_0 + line_1
      "umlal v16.8h, v0.8b, v20.8b \n"
      "umlal v17.8h, v1.8b, v20.8b \n"
      "umlal v18.8h, v2.8b, v20.8b \n"
      "umlal v19.8h, v3.8b, v20.8b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead

      // (3 * line_0 + line_1) >> 2
      "uqrshrn v0.8b, v16.8h, #2 \n"
      "uqrshrn v1.8b, v17.8h, #2 \n"
      "uqrshrn v2.8b, v18.8h, #2 \n"
      "uqrshrn v3.8b, v19.8h, #2 \n"
      "prfm pldl1keep, [%3, 448] \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v16.8h, v1.8b, #0 \n"
      "umlal v16.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v16.8h, #2 \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v16.8h, v2.8b, #0 \n"
      "umlal v16.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v16.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}

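// As above, but the two source rows are weighted equally (simple rounding
// average) before the columns are filtered.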
void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"
      // average src line 0 with src line 1
      "urhadd v0.8b, v0.8b, v4.8b \n"
      "urhadd v1.8b, v1.8b, v5.8b \n"
      "urhadd v2.8b, v2.8b, v6.8b \n"
      "urhadd v3.8b, v3.8b, v7.8b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v4.8h, v1.8b, #0 \n"
      "umlal v4.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v4.8h, #2 \n"
      "prfm pldl1keep, [%3, 448] \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v4.8h, v2.8b, #0 \n"
      "umlal v4.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v4.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}

static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
                              22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
                                34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
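// Note: the divide-by-6 and divide-by-9 constants are 65536 / 12 and
// 65536 / 18 because they are applied with sqrdmulh, which computes roughly
// (2 * a * b + 0x8000) >> 16; the doubling supplies the missing factor of 2.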

// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "ld1 {v3.16b}, [%3] \n"
      "1: \n"
      "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #12 \n"
      "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st1 {v2.8b}, [%1], #8 \n"
      "st1 {v2.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile(
      "ld1 {v29.8h}, [%5] \n"
      "ld1 {v30.16b}, [%6] \n"
      "ld1 {v31.8h}, [%7] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
      "subs %w4, %w4, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v20.8b, v0.8b, v1.8b \n"
      "trn2 v21.8b, v0.8b, v1.8b \n"
      "trn1 v22.8b, v4.8b, v5.8b \n"
      "trn2 v23.8b, v4.8b, v5.8b \n"
      "trn1 v24.8b, v16.8b, v17.8b \n"
      "trn2 v25.8b, v16.8b, v17.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"
      "trn1 v16.8b, v18.8b, v19.8b \n"
      "trn2 v17.8b, v18.8b, v19.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v20.4h, v20.8b \n"
      "uaddlp v21.4h, v21.8b \n"
      "uaddlp v22.4h, v22.8b \n"
      "uaddlp v23.4h, v23.8b \n"
      "uaddlp v24.4h, v24.8b \n"
      "uaddlp v25.4h, v25.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"
      "uaddlp v17.4h, v17.8b \n"

      // combine source lines
      "add v20.4h, v20.4h, v22.4h \n"
      "add v21.4h, v21.4h, v23.4h \n"
      "add v20.4h, v20.4h, v24.4h \n"
      "add v21.4h, v21.4h, v25.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"
      "add v2.4h, v2.4h, v17.4h \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "sqrdmulh v2.8h, v2.8h, v29.8h \n"
      "xtn v2.8b, v2.8h \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "ushll v16.8h, v16.8b, #0 \n"
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // combine source lines
      "add v0.8h, v0.8h, v16.8h \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead

      // 0+1+2, 3+4+5
      "add v20.8h, v20.8h, v0.8h \n"
      "add v21.8h, v21.8h, v4.8h \n"
      "prfm pldl1keep, [%2, 448] \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v20.8h, v31.8h \n"
      "sqrdmulh v1.8h, v21.8h, v31.8h \n"
      "prfm pldl1keep, [%3, 448] \n"

      // Align for table lookup, vtbl requires registers to be adjacent
      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(src_ptr1),        // %3
        "+r"(dst_width)        // %4
      : "r"(&kMult38_Div6),    // %5
        "r"(&kShuf38_2),       // %6
        "r"(&kMult38_Div9)     // %7
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
        "memory", "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile(
      "ld1 {v30.8h}, [%4] \n"
      "ld1 {v31.16b}, [%5] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "subs %w3, %w3, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v16.8b, v0.8b, v1.8b \n"
      "trn2 v17.8b, v0.8b, v1.8b \n"
      "trn1 v18.8b, v4.8b, v5.8b \n"
      "trn2 v19.8b, v4.8b, v5.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v16.4h, v16.8b \n"
      "uaddlp v17.4h, v17.8b \n"
      "uaddlp v18.4h, v18.8b \n"
      "uaddlp v19.4h, v19.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"

      // combine source lines
      "add v16.4h, v16.4h, v18.4h \n"
      "add v17.4h, v17.4h, v19.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "uqrshrn v2.8b, v2.8h, #2 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

      // combine source lines
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead

      // 0+1+2, 3+4+5
      "add v16.8h, v16.8h, v0.8h \n"
      "add v17.8h, v17.8h, v4.8h \n"
      "prfm pldl1keep, [%2, 448] \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v16.8h, v30.8h \n"
      "sqrdmulh v1.8h, v17.8h, v30.8h \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent

      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(dst_width)        // %3
      : "r"(&kMult38_Div6),    // %4
        "r"(&kShuf38_2)        // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v30", "v31", "memory", "cc");
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
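// Scalar equivalent (roughly): dst_ptr[i] += src_ptr[i] for 0 <= i < src_width.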
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.8h, v2.8h}, [%1] \n"  // load accumulator
      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 bytes
      "uaddw2 v2.8h, v2.8h, v0.16b \n"  // add
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "uaddw v1.8h, v1.8h, v0.8b \n"
      "st1 {v1.8h, v2.8h}, [%1], #32 \n"  // store accumulator
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
      );
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)            \
  "lsr %5, %3, #16 \n"                 \
  "add %6, %1, %5 \n"                  \
  "add %3, %3, %4 \n"                  \
  "ld2 {v4.b, v5.b}[" #n "], [%6] \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//   ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
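// For example, blending a = 10 and b = 20 with f = 0x8000 (half way) gives
// 10 + ((0x8000 * 10 + 0x8000) >> 16) = 10 + 5 = 15.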

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v3.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v1.4s, v1.4s, v0.4s \n"
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "add v2.4s, v1.4s, v3.4s \n"
      "shl v0.4s, v3.4s, #1 \n"  // 8 * dx
      "1: \n"
      LOAD2_DATA8_LANE(0)
      LOAD2_DATA8_LANE(1)
      LOAD2_DATA8_LANE(2)
      LOAD2_DATA8_LANE(3)
      LOAD2_DATA8_LANE(4)
      LOAD2_DATA8_LANE(5)
      LOAD2_DATA8_LANE(6)
      LOAD2_DATA8_LANE(7)
      "mov v6.16b, v1.16b \n"
      "mov v7.16b, v2.16b \n"
      "uzp1 v6.8h, v6.8h, v7.8h \n"
      "ushll v4.8h, v4.8b, #0 \n"
      "ushll v5.8h, v5.8b, #0 \n"
      "ssubl v16.4s, v5.4h, v4.4h \n"
      "ssubl2 v17.4s, v5.8h, v4.8h \n"
      "ushll v7.4s, v6.4h, #0 \n"
      "ushll2 v6.4s, v6.8h, #0 \n"
      "mul v16.4s, v16.4s, v7.4s \n"
      "mul v17.4s, v17.4s, v6.4s \n"
      "rshrn v6.4h, v16.4s, #16 \n"
      "rshrn2 v6.8h, v17.4s, #16 \n"
      "add v4.8h, v4.8h, v6.8h \n"
      "xtn v4.8b, v4.8h \n"

      "st1 {v4.8b}, [%0], #8 \n"  // store pixels
      "add v1.4s, v1.4s, v0.4s \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_ptr),    // %0
        "+r"(src_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3",
        "v4", "v5", "v6", "v7", "v16", "v17"
      );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
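// Blends two rows, roughly dst = (row0 * (256 - source_y_fraction) +
// row1 * source_y_fraction + 128) >> 8, with special cases for fractions of
// 0, 1/4, 1/2 and 3/4.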
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile(
      "cmp %w4, #0 \n"
      "b.eq 100f \n"
      "add %2, %2, %1 \n"
      "cmp %w4, #64 \n"
      "b.eq 75f \n"
      "cmp %w4, #128 \n"
      "b.eq 50f \n"
      "cmp %w4, #192 \n"
      "b.eq 25f \n"

      "dup v5.8b, %w4 \n"
      "dup v4.8b, %w5 \n"
      // General purpose row blend.
      "1: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "umull v6.8h, v0.8b, v4.8b \n"
      "umull2 v7.8h, v0.16b, v4.16b \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "umlal v6.8h, v1.8b, v5.8b \n"
      "umlal2 v7.8h, v1.16b, v5.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "rshrn v0.8b, v6.8h, #8 \n"
      "rshrn2 v0.16b, v7.8h, #8 \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 1b \n"
      "b 99f \n"

      // Blend 25 / 75.
      "25: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 25b \n"
      "b 99f \n"

      // Blend 50 / 50.
      "50: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 50b \n"
      "b 99f \n"

      // Blend 75 / 25.
      "75: \n"
      "ld1 {v1.16b}, [%1], #16 \n"
      "ld1 {v0.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 75b \n"
      "b 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 100b \n"

      "99: \n"
      "st1 {v0.b}[15], [%0] \n"
      : "+r"(dst_ptr),            // %0
        "+r"(src_ptr),            // %1
        "+r"(src_stride),         // %2
        "+r"(dst_width),          // %3
        "+r"(source_y_fraction),  // %4
        "+r"(y_fraction)          // %5
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "mov v2.16b, v3.16b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st2 {v1.4s,v2.4s}, [%1], #32 \n"  // store 8 odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
      );
}

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop

      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "urhadd v1.16b, v2.16b, v3.16b \n"
      "st2 {v0.4s,v1.4s}, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
      );
}

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
      "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
      "uaddlp v3.8h, v3.16b \n"  // A 16 bytes -> 8 shorts.
      "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
      "uadalp v0.8h, v16.16b \n"  // B 16 bytes -> 8 shorts.
      "uadalp v1.8h, v17.16b \n"  // G 16 bytes -> 8 shorts.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "uadalp v2.8h, v18.16b \n"  // R 16 bytes -> 8 shorts.
      "uadalp v3.8h, v19.16b \n"  // A 16 bytes -> 8 shorts.
      "prfm pldl1keep, [%1, 448] \n"
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn v1.8b, v1.8h, #2 \n"
      "rshrn v2.8b, v2.8h, #2 \n"
      "rshrn v3.8b, v3.8h, #2 \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld1 {v0.s}[0], [%0], %3 \n"
      "ld1 {v0.s}[1], [%0], %3 \n"
      "ld1 {v0.s}[2], [%0], %3 \n"
      "ld1 {v0.s}[3], [%0], %3 \n"
      "subs %w2, %w2, #4 \n"  // 4 pixels per loop.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.8b}, [%0], %4 \n"  // Read 4 2x2 -> 2x1
      "ld1 {v1.8b}, [%1], %4 \n"
      "ld1 {v2.8b}, [%0], %4 \n"
      "ld1 {v3.8b}, [%1], %4 \n"
      "ld1 {v4.8b}, [%0], %4 \n"
      "ld1 {v5.8b}, [%1], %4 \n"
      "ld1 {v6.8b}, [%0], %4 \n"
      "ld1 {v7.8b}, [%1], %4 \n"
      "uaddl v0.8h, v0.8b, v1.8b \n"
      "uaddl v2.8h, v2.8b, v3.8b \n"
      "uaddl v4.8h, v4.8b, v5.8b \n"
      "uaddl v6.8h, v6.8b, v7.8b \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "mov v16.d[1], v0.d[1] \n"  // ab_cd -> ac_bd
      "mov v0.d[1], v2.d[0] \n"
      "mov v2.d[0], v16.d[1] \n"
      "mov v16.d[1], v4.d[1] \n"  // ef_gh -> eg_fh
      "mov v4.d[1], v6.d[0] \n"
      "mov v6.d[0], v16.d[1] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "add v0.8h, v0.8h, v2.8h \n"  // (a+b)_(c+d)
      "add v4.8h, v4.8h, v6.8h \n"  // (e+f)_(g+h)
      "rshrn v0.8b, v0.8h, #2 \n"  // first 2 pixels.
      "rshrn2 v0.16b, v4.8h, #2 \n"  // next 2 pixels.
      "subs %w3, %w3, #4 \n"  // 4 pixels per loop.
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(src_stride),  // %1
        "+r"(dst_argb),    // %2
        "+r"(dst_width)    // %3
      : "r"((int64_t)(src_stepx * 4))  // %4
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n)       \
  "lsr %5, %3, #16 \n"                 \
  "add %6, %1, %5, lsl #2 \n"          \
  "add %3, %3, %4 \n"                  \
  "ld1 {" #vn ".s}[" #n "], [%6] \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;
  asm volatile(
      "1: \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      // clang-format on
      "st1 {v0.4s, v1.4s}, [%0], #32 \n"  // store pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
  "lsr %5, %3, #16 \n"                                  \
  "add %6, %1, %5, lsl #2 \n"                           \
  "add %3, %3, %4 \n"                                   \
  "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v6.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      "movi v3.16b, #0x7f \n"  // 0x7F
      "movi v4.8h, #0x7f \n"  // 0x7F
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v5.4s, v1.4s, v0.4s \n"
      "1: \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(v0, v1, 0)
      LOAD2_DATA32_LANE(v0, v1, 1)
      LOAD2_DATA32_LANE(v0, v1, 2)
      LOAD2_DATA32_LANE(v0, v1, 3)
      "shrn v2.4h, v5.4s, #9 \n"
      "and v2.8b, v2.8b, v4.8b \n"
      "dup v16.8b, v2.b[0] \n"
      "dup v17.8b, v2.b[2] \n"
      "dup v18.8b, v2.b[4] \n"
      "dup v19.8b, v2.b[6] \n"
      "ext v2.8b, v16.8b, v17.8b, #4 \n"
      "ext v17.8b, v18.8b, v19.8b, #4 \n"
      "ins v2.d[1], v17.d[0] \n"  // f
      "eor v7.16b, v2.16b, v3.16b \n"  // 0x7f ^ f
      "umull v16.8h, v0.8b, v7.8b \n"
      "umull2 v17.8h, v0.16b, v7.16b \n"
      "umull v18.8h, v1.8b, v2.8b \n"
      "umull2 v19.8h, v1.16b, v2.16b \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "add v16.8h, v16.8h, v18.8h \n"
      "add v17.8h, v17.8h, v19.8h \n"
      "shrn v0.8b, v16.8h, #7 \n"
      "shrn2 v0.16b, v17.8h, #7 \n"
      "st1 {v0.4s}, [%0], #16 \n"  // store pixels
      "add v5.4s, v5.4s, v6.4s \n"
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
        "v6", "v7", "v16", "v17", "v18", "v19"
      );
}

#undef LOAD2_DATA32_LANE

// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
1013 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
1014 "1: \n"
1015 "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
1016 "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
1017 "subs %w3, %w3, #8 \n" // 8 processed per loop
1018 "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
1019 "uaddlp v1.4s, v1.8h \n"
1020 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
1021 "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
1022 "uadalp v1.4s, v3.8h \n"
1023 "prfm pldl1keep, [%1, 448] \n"
1024 "rshrn v0.4h, v0.4s, #2 \n" // round and pack
1025 "rshrn2 v0.8h, v1.4s, #2 \n"
1026 "st1 {v0.8h}, [%2], #16 \n"
1027 "b.gt 1b \n"
1028 : "+r"(src_ptr), // %0
1029 "+r"(src_stride), // %1
1030 "+r"(dst), // %2
1031 "+r"(dst_width) // %3
1032 :
1033 : "v0", "v1", "v2", "v3" // Clobber List
1034 );
1035 }
1036
1037 // Read 8x2 upsample with filtering and write 16x1.
1038 // Actually reads an extra pixel, so 9x2.
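// Each output pixel is a weighted sum of the surrounding 2x2 source pixels
// with weights 9, 3, 3, 1 (sum 16), e.g. roughly
// (9 * nearest + 3 * horizontal + 3 * vertical + 1 * diagonal + 8) >> 4.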
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
1044 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
1045 "movi v0.8h, #9 \n" // constants
1046 "movi v1.4s, #3 \n"
1047
1048 "1: \n"
1049 "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
1050 "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
1051 "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
1052 "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
1053 "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
1054 "umull v16.4s, v3.4h, v0.4h \n"
1055 "umull2 v7.4s, v3.8h, v0.8h \n"
1056 "umull v18.4s, v4.4h, v0.4h \n"
1057 "umull2 v17.4s, v4.8h, v0.8h \n"
1058 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
1059 "uaddw v16.4s, v16.4s, v6.4h \n"
1060 "uaddl2 v19.4s, v6.8h, v3.8h \n"
1061 "uaddl v3.4s, v6.4h, v3.4h \n"
1062 "uaddw2 v6.4s, v7.4s, v6.8h \n"
1063 "uaddl2 v7.4s, v5.8h, v4.8h \n"
1064 "uaddl v4.4s, v5.4h, v4.4h \n"
1065 "uaddw v18.4s, v18.4s, v5.4h \n"
1066 "prfm pldl1keep, [%1, 448] \n"
1067 "mla v16.4s, v4.4s, v1.4s \n"
1068 "mla v18.4s, v3.4s, v1.4s \n"
1069 "mla v6.4s, v7.4s, v1.4s \n"
1070 "uaddw2 v4.4s, v17.4s, v5.8h \n"
1071 "uqrshrn v16.4h, v16.4s, #4 \n"
1072 "mla v4.4s, v19.4s, v1.4s \n"
1073 "uqrshrn2 v16.8h, v6.4s, #4 \n"
1074 "uqrshrn v17.4h, v18.4s, #4 \n"
1075 "uqrshrn2 v17.8h, v4.4s, #4 \n"
1076 "st2 {v16.8h-v17.8h}, [%2], #32 \n"
1077 "b.gt 1b \n"
1078 : "+r"(src_ptr), // %0
1079 "+r"(src_stride), // %1
1080 "+r"(dst), // %2
1081 "+r"(dst_width) // %3
1082 : "r"(2LL), // %4
1083 "r"(14LL) // %5
1084 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
1085 "v19" // Clobber List
1086 );
1087 }
1088
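// Read 16x2 UV pairs, 2x2 box average and write 8x1 UV pairs.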
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 UV
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uaddlp v0.8h, v0.16b \n"  // U 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // V 16 bytes -> 8 shorts.
      "ld2 {v16.16b,v17.16b}, [%1], #32 \n"  // load 16
      "uadalp v0.8h, v16.16b \n"  // U 16 bytes -> 8 shorts.
      "uadalp v1.8h, v17.16b \n"  // V 16 bytes -> 8 shorts.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "prfm pldl1keep, [%1, 448] \n"
      "rshrn v1.8b, v1.8h, #2 \n"
      "st2 {v0.8b,v1.8b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v16", "v17");
}

// Reads 4 pixels at a time.
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int src_stepx,  // pixel step
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld1 {v0.h}[0], [%0], %6 \n"
      "ld1 {v1.h}[0], [%1], %6 \n"
      "ld1 {v2.h}[0], [%2], %6 \n"
      "ld1 {v3.h}[0], [%3], %6 \n"
      "subs %w5, %w5, #4 \n"  // 4 pixels per loop.
      "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(src1_ptr),  // %1
        "+r"(src2_ptr),  // %2
        "+r"(src3_ptr),  // %3
        "+r"(dst_ptr),   // %4
        "+r"(dst_width)  // %5
      : "r"((int64_t)(src_stepx * 8))  // %6
      : "memory", "cc", "v0", "v1", "v2", "v3");
}

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif