/*
 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1 throw away even pixels, and write 16x1.
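// Rough scalar equivalent of the kernel below (illustrative sketch only, not
// part of the build): the ld2 de-interleaves even/odd bytes and only the odd
// bytes are stored, i.e. approximately
//   for (int i = 0; i < dst_width; ++i) dst[i] = src_ptr[2 * i + 1];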
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    // load even pixels into v0, odd into v1
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
    "subs %w2, %w2, #16 \n" // 16 processed per loop
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst), // %1
      "+r"(dst_width) // %2
    :
    : "v0", "v1" // Clobber List
  );
}

// Read 32x1 average down and write 16x1.
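// urhadd is a rounding halving add, so each output below is roughly
//   dst[i] = (src_ptr[2 * i] + src_ptr[2 * i + 1] + 1) >> 1;
// (illustrative sketch only, not part of the build).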
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    // load even pixels into v0, odd into v1
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
    "subs %w2, %w2, #16 \n" // 16 processed per loop
    "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v0.16b}, [%1], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst), // %1
      "+r"(dst_width) // %2
    :
    : "v0", "v1" // Clobber List
  );
}

// Read 32x2 average down and write 16x1.
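// Rough scalar sketch of the 2x2 box filter below (illustrative only, not
// part of the build), with s = row 0 and t = row 1:
//   dst[i] = (s[2 * i] + s[2 * i + 1] + t[2 * i] + t[2 * i + 1] + 2) >> 2;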
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  asm volatile(
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
    "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
    "subs %w3, %w3, #16 \n" // 16 processed per loop
    "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uaddlp v1.8h, v1.16b \n"
    "prfm pldl1keep, [%1, 448] \n"
    "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
    "uadalp v1.8h, v3.16b \n"
    "rshrn v0.8b, v0.8h, #2 \n" // round and pack
    "rshrn2 v0.16b, v1.8h, #2 \n"
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_width) // %3
    :
    : "v0", "v1", "v2", "v3" // Clobber List
  );
}

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v2.8b}, [%1], #8 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(dst_width) // %2
    :
    : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
    "1: \n"
    "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
    "ld1 {v1.16b}, [%2], #16 \n"
    "ld1 {v2.16b}, [%3], #16 \n"
    "ld1 {v3.16b}, [%4], #16 \n"
    "subs %w5, %w5, #4 \n"
    "uaddlp v0.8h, v0.16b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uadalp v0.8h, v1.16b \n"
    "prfm pldl1keep, [%2, 448] \n"
    "uadalp v0.8h, v2.16b \n"
    "prfm pldl1keep, [%3, 448] \n"
    "uadalp v0.8h, v3.16b \n"
    "prfm pldl1keep, [%4, 448] \n"
    "addp v0.8h, v0.8h, v0.8h \n"
    "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
    "st1 {v0.s}[0], [%1], #4 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(src_ptr1), // %2
      "+r"(src_ptr2), // %3
      "+r"(src_ptr3), // %4
      "+r"(dst_width) // %5
    :
    : "v0", "v1", "v2", "v3", "memory", "cc");
}

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "subs %w2, %w2, #24 \n"
    "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(dst_width) // %2
    :
    : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  asm volatile(
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
    "1: \n"
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
    "subs %w2, %w2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "ushll v16.8h, v4.8b, #0 \n"
    "ushll v17.8h, v5.8b, #0 \n"
    "ushll v18.8h, v6.8b, #0 \n"
    "ushll v19.8h, v7.8b, #0 \n"

    // 3 * line_0 + line_1
    "umlal v16.8h, v0.8b, v20.8b \n"
    "umlal v17.8h, v1.8b, v20.8b \n"
    "umlal v18.8h, v2.8b, v20.8b \n"
    "umlal v19.8h, v3.8b, v20.8b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    // (3 * line_0 + line_1 + 2) >> 2
    "uqrshrn v0.8b, v16.8h, #2 \n"
    "uqrshrn v1.8b, v17.8h, #2 \n"
    "uqrshrn v2.8b, v18.8h, #2 \n"
    "uqrshrn v3.8b, v19.8h, #2 \n"
    "prfm pldl1keep, [%3, 448] \n"

    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    "ushll v16.8h, v1.8b, #0 \n"
    "umlal v16.8h, v0.8b, v20.8b \n"
    "uqrshrn v0.8b, v16.8h, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    "urhadd v1.8b, v1.8b, v2.8b \n"

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    "ushll v16.8h, v2.8b, #0 \n"
    "umlal v16.8h, v3.8b, v20.8b \n"
    "uqrshrn v2.8b, v16.8h, #2 \n"

    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(dst_width), // %2
      "+r"(src_stride) // %3
    :
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
      "v19", "v20", "memory", "cc");
}

void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  asm volatile(
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
    "1: \n"
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
    "subs %w2, %w2, #24 \n"
    // average src line 0 with src line 1
    "urhadd v0.8b, v0.8b, v4.8b \n"
    "urhadd v1.8b, v1.8b, v5.8b \n"
    "urhadd v2.8b, v2.8b, v6.8b \n"
    "urhadd v3.8b, v3.8b, v7.8b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    "ushll v4.8h, v1.8b, #0 \n"
    "umlal v4.8h, v0.8b, v20.8b \n"
    "uqrshrn v0.8b, v4.8h, #2 \n"
    "prfm pldl1keep, [%3, 448] \n"

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    "urhadd v1.8b, v1.8b, v2.8b \n"

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    "ushll v4.8h, v2.8b, #0 \n"
    "umlal v4.8h, v3.8b, v20.8b \n"
    "uqrshrn v2.8b, v4.8h, #2 \n"

    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(dst_width), // %2
      "+r"(src_stride) // %3
    :
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}

static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
                              22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
                                34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
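
// Note on the constants above: sqrdmulh computes (2 * a * b + 0x8000) >> 16
// with saturation, so multiplying a pair sum by 65536 / 12 divides it by ~6
// and multiplying by 65536 / 18 divides it by ~9, the box-filter divisors
// used by the 3/8 row scalers below.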

// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "ld1 {v3.16b}, [%3] \n"
    "1: \n"
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
    "subs %w2, %w2, #12 \n"
    "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v2.8b}, [%1], #8 \n"
    "st1 {v2.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(dst_width) // %2
    : "r"(&kShuf38) // %3
    : "v0", "v1", "v2", "v3", "memory", "cc");
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile(
    "ld1 {v29.8h}, [%5] \n"
    "ld1 {v30.16b}, [%6] \n"
    "ld1 {v31.8h}, [%7] \n"
    "add %2, %2, %0 \n"
    "1: \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
    "subs %w4, %w4, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v20.8b, v0.8b, v1.8b \n"
    "trn2 v21.8b, v0.8b, v1.8b \n"
    "trn1 v22.8b, v4.8b, v5.8b \n"
    "trn2 v23.8b, v4.8b, v5.8b \n"
    "trn1 v24.8b, v16.8b, v17.8b \n"
    "trn2 v25.8b, v16.8b, v17.8b \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1 v0.8b, v2.8b, v3.8b \n"
    "trn2 v1.8b, v2.8b, v3.8b \n"
    "trn1 v4.8b, v6.8b, v7.8b \n"
    "trn2 v5.8b, v6.8b, v7.8b \n"
    "trn1 v16.8b, v18.8b, v19.8b \n"
    "trn2 v17.8b, v18.8b, v19.8b \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp v20.4h, v20.8b \n"
    "uaddlp v21.4h, v21.8b \n"
    "uaddlp v22.4h, v22.8b \n"
    "uaddlp v23.4h, v23.8b \n"
    "uaddlp v24.4h, v24.8b \n"
    "uaddlp v25.4h, v25.8b \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp v1.4h, v1.8b \n"
    "uaddlp v5.4h, v5.8b \n"
    "uaddlp v17.4h, v17.8b \n"

    // combine source lines
    "add v20.4h, v20.4h, v22.4h \n"
    "add v21.4h, v21.4h, v23.4h \n"
    "add v20.4h, v20.4h, v24.4h \n"
    "add v21.4h, v21.4h, v25.4h \n"
    "add v2.4h, v1.4h, v5.4h \n"
    "add v2.4h, v2.4h, v17.4h \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "sqrdmulh v2.8h, v2.8h, v29.8h \n"
    "xtn v2.8b, v2.8h \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "ushll v16.8h, v16.8b, #0 \n"
    "uaddl v0.8h, v0.8b, v4.8b \n"

    // combine source lines
    "add v0.8h, v0.8h, v16.8h \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1 v1.8h, v0.8h, v0.8h \n"
    "trn2 v4.8h, v0.8h, v0.8h \n"
    "xtn v0.4h, v1.4s \n"
    "xtn v4.4h, v4.4s \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    // 0+1+2, 3+4+5
    "add v20.8h, v20.8h, v0.8h \n"
    "add v21.8h, v21.8h, v4.8h \n"
    "prfm pldl1keep, [%2, 448] \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "sqrdmulh v0.8h, v20.8h, v31.8h \n"
    "sqrdmulh v1.8h, v21.8h, v31.8h \n"
    "prfm pldl1keep, [%3, 448] \n"

    // Align for table lookup, vtbl requires registers to be adjacent
    "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

    "st1 {v3.8b}, [%1], #8 \n"
    "st1 {v3.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(tmp_src_stride), // %2
      "+r"(src_ptr1), // %3
      "+r"(dst_width) // %4
    : "r"(&kMult38_Div6), // %5
      "r"(&kShuf38_2), // %6
      "r"(&kMult38_Div9) // %7
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
      "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
      "memory", "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile(
    "ld1 {v30.8h}, [%4] \n"
    "ld1 {v31.16b}, [%5] \n"
    "add %2, %2, %0 \n"
    "1: \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
    "subs %w3, %w3, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v16.8b, v0.8b, v1.8b \n"
    "trn2 v17.8b, v0.8b, v1.8b \n"
    "trn1 v18.8b, v4.8b, v5.8b \n"
    "trn2 v19.8b, v4.8b, v5.8b \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1 v0.8b, v2.8b, v3.8b \n"
    "trn2 v1.8b, v2.8b, v3.8b \n"
    "trn1 v4.8b, v6.8b, v7.8b \n"
    "trn2 v5.8b, v6.8b, v7.8b \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp v16.4h, v16.8b \n"
    "uaddlp v17.4h, v17.8b \n"
    "uaddlp v18.4h, v18.8b \n"
    "uaddlp v19.4h, v19.8b \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp v1.4h, v1.8b \n"
    "uaddlp v5.4h, v5.8b \n"

    // combine source lines
    "add v16.4h, v16.4h, v18.4h \n"
    "add v17.4h, v17.4h, v19.4h \n"
    "add v2.4h, v1.4h, v5.4h \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "uqrshrn v2.8b, v2.8h, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines
    "uaddl v0.8h, v0.8b, v4.8b \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1 v1.8h, v0.8h, v0.8h \n"
    "trn2 v4.8h, v0.8h, v0.8h \n"
    "xtn v0.4h, v1.4s \n"
    "xtn v4.4h, v4.4s \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    // 0+1+2, 3+4+5
    "add v16.8h, v16.8h, v0.8h \n"
    "add v17.8h, v17.8h, v4.8h \n"
    "prfm pldl1keep, [%2, 448] \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "sqrdmulh v0.8h, v16.8h, v30.8h \n"
    "sqrdmulh v1.8h, v17.8h, v30.8h \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent

    "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    "st1 {v3.8b}, [%1], #8 \n"
    "st1 {v3.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(tmp_src_stride), // %2
      "+r"(dst_width) // %3
    : "r"(&kMult38_Div6), // %4
      "r"(&kShuf38_2) // %5
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
      "v19", "v30", "v31", "memory", "cc");
}

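// The 2x horizontal upsampler below applies a [3, 1]/4 kernel. Rough scalar
// reference (illustrative only, not part of the build):
//   dst[2 * i + 0] = (3 * src[i] + src[i + 1] + 2) >> 2;
//   dst[2 * i + 1] = (src[i] + 3 * src[i + 1] + 2) >> 2;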
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
    uint8_t* dst_ptr,
    int dst_width) {
  const uint8_t* src_temp = src_ptr + 1;
  asm volatile(
    "movi v31.8b, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n" // 01234567
    "ldr d1, [%1], #8 \n" // 12345678
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
    "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)

    "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
    "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)

    "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
    "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)

    "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
    "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_temp), // %1
      "+r"(dst_ptr), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
  );
}

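// The 2x2 bilinear upsampler below combines two such rows with 3:1 vertical
// weights, giving the usual 9/3/3/1 kernel: each output is
//   (9 * nearest + 3 * horizontal + 3 * vertical + 1 * diagonal + 8) >> 4.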
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    ptrdiff_t dst_stride,
    int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint8_t* src_temp = src_ptr + 1;
  const uint8_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
    "movi v31.8b, #3 \n"
    "movi v30.8h, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n" // 01234567
    "ldr d1, [%2], #8 \n" // 12345678
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
    "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
    "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
    "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)

    "ldr d0, [%1], #8 \n"
    "ldr d1, [%3], #8 \n"
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead

    "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
    "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
    "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
    "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)

    "mov v0.16b, v4.16b \n"
    "mov v1.16b, v5.16b \n"
    "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
    "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
    "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
    "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)

    "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
    "rshrn v1.8b, v3.8h, #4 \n" // 2, even
    "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
    "rshrn v3.8b, v5.8h, #4 \n" // 1, even

    "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 2
    "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 1
    "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_ptr1), // %1
      "+r"(src_temp), // %2
      "+r"(src_temp1), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_ptr1), // %5
      "+r"(dst_width) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
      "v31" // Clobber List
  );
}

void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
    uint16_t* dst_ptr,
    int dst_width) {
  const uint16_t* src_temp = src_ptr + 1;
  asm volatile(
    "movi v31.8h, #3 \n"

    "1: \n"
    "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
    "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "mov v2.16b, v0.16b \n"
    "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
    "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)

    "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
    "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)

    "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
    "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_temp), // %1
      "+r"(dst_ptr), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
  );
}

void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
    ptrdiff_t src_stride,
    uint16_t* dst_ptr,
    ptrdiff_t dst_stride,
    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 1;
  const uint16_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
    "movi v31.8h, #3 \n"

    "1: \n"
    "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
    "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "mov v0.16b, v2.16b \n"
    "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
    "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)

    "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
    "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead

    "mov v0.16b, v4.16b \n"
    "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
    "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)

    "mov v0.16b, v4.16b \n"
    "mov v1.16b, v5.16b \n"
    "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
    "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
    "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
    "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)

    "urshr v2.8h, v2.8h, #4 \n" // 2, odd
    "urshr v1.8h, v3.8h, #4 \n" // 2, even
    "urshr v4.8h, v4.8h, #4 \n" // 1, odd
    "urshr v3.8h, v5.8h, #4 \n" // 1, even

    "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
    "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2

    "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_ptr1), // %1
      "+r"(src_temp), // %2
      "+r"(src_temp1), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_ptr1), // %5
      "+r"(dst_width) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
      "v31" // Clobber List
  );
}

void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
    uint16_t* dst_ptr,
    int dst_width) {
  const uint16_t* src_temp = src_ptr + 1;
  asm volatile(
    "movi v31.8h, #3 \n"

    "1: \n"
    "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
    "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
    "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b)
    "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b)
    "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b)

    "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
    "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd)
    "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
    "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even)

    "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far
    "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
    "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far
    "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd)

    "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store
    "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_temp), // %1
      "+r"(dst_ptr), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
  );
}

void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
    ptrdiff_t src_stride,
    uint16_t* dst_ptr,
    ptrdiff_t dst_stride,
    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 1;
  const uint16_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
    "movi v31.4h, #3 \n"
    "movi v30.4s, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n" // 0123 (16b)
    "ldr d1, [%2], #8 \n" // 1234 (16b)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
    "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b)
    "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
    "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)

    "ldr d0, [%1], #8 \n" // 0123 (16b)
    "ldr d1, [%3], #8 \n" // 1234 (16b)
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
    "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b)
    "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b)
    "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
    "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)

    "mov v0.16b, v4.16b \n"
    "mov v1.16b, v5.16b \n"
    "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
    "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
    "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
    "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)

    "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far
    "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far
    "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far
    "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far

    "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1
    "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2

    "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_ptr1), // %1
      "+r"(src_temp), // %2
      "+r"(src_temp1), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_ptr1), // %5
      "+r"(dst_width) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
      "v31" // Clobber List
  );
}

void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
    uint8_t* dst_ptr,
    int dst_width) {
  const uint8_t* src_temp = src_ptr + 2;
  asm volatile(
    "movi v31.8b, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n" // 00112233 (1u1v)
    "ldr d1, [%1], #8 \n" // 11223344 (1u1v)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
    "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)

    "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
    "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)

    "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
    "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)

    "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
    "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_temp), // %1
      "+r"(dst_ptr), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
  );
}

void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    ptrdiff_t dst_stride,
    int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint8_t* src_temp = src_ptr + 2;
  const uint8_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
    "movi v31.8b, #3 \n"
    "movi v30.8h, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n"
    "ldr d1, [%2], #8 \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.8h, v0.8b, #0 \n"
    "ushll v3.8h, v1.8b, #0 \n"
    "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
    "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)

    "ldr d0, [%1], #8 \n"
    "ldr d1, [%3], #8 \n"
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead

    "ushll v4.8h, v0.8b, #0 \n"
    "ushll v5.8h, v1.8b, #0 \n"
    "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
    "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)

    "mov v0.16b, v4.16b \n"
    "mov v1.16b, v5.16b \n"
    "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
    "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
    "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
    "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)

    "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
    "rshrn v1.8b, v3.8h, #4 \n" // 2, even
    "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
    "rshrn v3.8b, v5.8h, #4 \n" // 1, even

    "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
    "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
    "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_ptr1), // %1
      "+r"(src_temp), // %2
      "+r"(src_temp1), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_ptr1), // %5
      "+r"(dst_width) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
      "v31" // Clobber List
  );
}

void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
    uint16_t* dst_ptr,
    int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;
  asm volatile(
    "movi v31.8h, #3 \n"

    "1: \n"
    "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
    "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

    "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
    "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
    "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
    "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)

    "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
    "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
    "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
    "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)

    "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
    "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
    "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
    "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)

    "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store
    "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store
    "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_temp), // %1
      "+r"(dst_ptr), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
      "v31" // Clobber List
  );
}

void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
    ptrdiff_t src_stride,
    uint16_t* dst_ptr,
    ptrdiff_t dst_stride,
    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 2;
  const uint16_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
    "movi v31.4h, #3 \n"
    "movi v30.4s, #3 \n"

    "1: \n"
    "ldr d0, [%0], #8 \n"
    "ldr d1, [%2], #8 \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
    "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
    "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
    "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)

    "ldr d0, [%1], #8 \n"
    "ldr d1, [%3], #8 \n"
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
    "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
    "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
    "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
    "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)

    "mov v0.16b, v4.16b \n"
    "mov v1.16b, v5.16b \n"
    "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
    "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
    "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
    "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)

    "rshrn v1.4h, v2.4s, #4 \n" // 2, odd
    "rshrn v0.4h, v3.4s, #4 \n" // 2, even
    "rshrn v3.4h, v4.4s, #4 \n" // 1, odd
    "rshrn v2.4h, v5.4s, #4 \n" // 1, even

    "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
    "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
    "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_ptr1), // %1
      "+r"(src_temp), // %2
      "+r"(src_temp1), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_ptr1), // %5
      "+r"(dst_width) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
      "v31" // Clobber List
  );
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
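// Roughly equivalent scalar accumulation (illustrative only):
//   for (int i = 0; i < src_width; ++i) dst_ptr[i] += src_ptr[i];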
void ScaleAddRow_NEON(const uint8_t* src_ptr,
    uint16_t* dst_ptr,
    int src_width) {
  asm volatile(
    "1: \n"
    "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
    "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
    "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uaddw v1.8h, v1.8h, v0.8b \n"
    "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
    "subs %w2, %w2, #16 \n" // 16 processed per loop
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst_ptr), // %1
      "+r"(src_width) // %2
    :
    : "memory", "cc", "v0", "v1", "v2" // Clobber List
  );
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5 \n" \
  "add %3, %3, %4 \n" \
  "ld2 {v4.b, v5.b}[" #n "], [%6] \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
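// Roughly, each output pixel of the column filter below is computed as
// (illustrative scalar sketch, not part of the build):
//   int xi = x >> 16;
//   int a = src_ptr[xi];
//   int b = src_ptr[xi + 1];
//   dst_ptr[i] = BLENDER(a, b, x & 0xffff);
//   x += dx;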

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
    const uint8_t* src_ptr,
    int dst_width,
    int x,
    int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  int64_t x64 = (int64_t)x; // NOLINT
  int64_t dx64 = (int64_t)dx; // NOLINT
  asm volatile (
    "dup v0.4s, %w3 \n" // x
    "dup v1.4s, %w4 \n" // dx
    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
    "shl v3.4s, v1.4s, #2 \n" // 4 * dx
    "mul v1.4s, v1.4s, v2.4s \n"
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add v1.4s, v1.4s, v0.4s \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add v2.4s, v1.4s, v3.4s \n"
    "shl v0.4s, v3.4s, #1 \n" // 8 * dx
    "1: \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov v6.16b, v1.16b \n"
    "mov v7.16b, v2.16b \n"
    "uzp1 v6.8h, v6.8h, v7.8h \n"
    "ushll v4.8h, v4.8b, #0 \n"
    "ushll v5.8h, v5.8b, #0 \n"
    "ssubl v16.4s, v5.4h, v4.4h \n"
    "ssubl2 v17.4s, v5.8h, v4.8h \n"
    "ushll v7.4s, v6.4h, #0 \n"
    "ushll2 v6.4s, v6.8h, #0 \n"
    "mul v16.4s, v16.4s, v7.4s \n"
    "mul v17.4s, v17.4s, v6.4s \n"
    "rshrn v6.4h, v16.4s, #16 \n"
    "rshrn2 v6.8h, v17.4s, #16 \n"
    "add v4.8h, v4.8h, v6.8h \n"
    "xtn v4.8b, v4.8h \n"

    "st1 {v4.8b}, [%0], #8 \n" // store pixels
    "add v1.4s, v1.4s, v0.4s \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "b.gt 1b \n"
    : "+r"(dst_ptr), // %0
      "+r"(src_ptr), // %1
      "+r"(dst_width), // %2
      "+r"(x64), // %3
      "+r"(dx64), // %4
      "+r"(tmp), // %5
      "+r"(src_tmp) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3",
      "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
    "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "mov v2.16b, v3.16b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst), // %1
      "+r"(dst_width) // %2
    :
    : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    uint8_t* dst_argb,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
    "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop

    "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "urhadd v1.16b, v2.16b, v3.16b \n"
    "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
    "b.gt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(dst_width) // %2
    :
    : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  asm volatile(
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
    "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
    "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
    "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
    "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
    "prfm pldl1keep, [%1, 448] \n"
    "rshrn v0.8b, v0.8h, #2 \n" // round and pack
    "rshrn v1.8b, v1.8h, #2 \n"
    "rshrn v2.8b, v2.8h, #2 \n"
    "rshrn v3.8b, v3.8h, #2 \n"
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
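// Rough scalar sketch of the point sampler below (illustrative only, not
// part of the build), treating each ARGB pixel as one 32-bit word:
//   for (int i = 0; i < dst_width; ++i) dst[i] = src[i * src_stepx];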
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8_t* dst_argb,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld1 {v0.s}[0], [%0], %3 \n"
    "ld1 {v0.s}[1], [%0], %3 \n"
    "ld1 {v0.s}[2], [%0], %3 \n"
    "ld1 {v0.s}[3], [%0], %3 \n"
    "subs %w2, %w2, #4 \n" // 4 pixels per loop.
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v0.16b}, [%1], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(dst_width) // %2
    : "r"((int64_t)(src_stepx * 4)) // %3
    : "memory", "cc", "v0");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8_t* dst_argb,
    int dst_width) {
  asm volatile(
    "add %1, %1, %0 \n"
    "1: \n"
    "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
    "ld1 {v1.8b}, [%1], %4 \n"
    "ld1 {v2.8b}, [%0], %4 \n"
    "ld1 {v3.8b}, [%1], %4 \n"
    "ld1 {v4.8b}, [%0], %4 \n"
    "ld1 {v5.8b}, [%1], %4 \n"
    "ld1 {v6.8b}, [%0], %4 \n"
    "ld1 {v7.8b}, [%1], %4 \n"
    "uaddl v0.8h, v0.8b, v1.8b \n"
    "uaddl v2.8h, v2.8b, v3.8b \n"
    "uaddl v4.8h, v4.8b, v5.8b \n"
    "uaddl v6.8h, v6.8b, v7.8b \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
    "mov v0.d[1], v2.d[0] \n"
    "mov v2.d[0], v16.d[1] \n"
    "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
    "mov v4.d[1], v6.d[0] \n"
    "mov v6.d[0], v16.d[1] \n"
    "prfm pldl1keep, [%1, 448] \n"
    "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
    "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
    "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
    "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
    "subs %w3, %w3, #4 \n" // 4 pixels per loop.
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_argb), // %0
      "+r"(src_stride), // %1
      "+r"(dst_argb), // %2
      "+r"(dst_width) // %3
    : "r"((int64_t)(src_stepx * 4)) // %4
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
  "ld1 {" #vn ".s}[" #n "], [%6] \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
    const uint8_t* src_argb,
    int dst_width,
    int x,
    int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x; // NOLINT
  int64_t dx64 = (int64_t)dx; // NOLINT
  int64_t tmp64;
  asm volatile(
    "1: \n"
    // clang-format off
    LOAD1_DATA32_LANE(v0, 0)
    LOAD1_DATA32_LANE(v0, 1)
    LOAD1_DATA32_LANE(v0, 2)
    LOAD1_DATA32_LANE(v0, 3)
    LOAD1_DATA32_LANE(v1, 0)
    LOAD1_DATA32_LANE(v1, 1)
    LOAD1_DATA32_LANE(v1, 2)
    LOAD1_DATA32_LANE(v1, 3)
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
    // clang-format on
    "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "b.gt 1b \n"
    : "+r"(dst_argb), // %0
      "+r"(src_argb), // %1
      "+r"(dst_width), // %2
      "+r"(x64), // %3
      "+r"(dx64), // %4
      "=&r"(tmp64), // %5
      "+r"(src_tmp) // %6
    :
    : "memory", "cc", "v0", "v1");
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
  "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
    const uint8_t* src_argb,
    int dst_width,
    int x,
    int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x; // NOLINT
  int64_t dx64 = (int64_t)dx; // NOLINT
  asm volatile (
    "dup v0.4s, %w3 \n" // x
    "dup v1.4s, %w4 \n" // dx
    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
    "shl v6.4s, v1.4s, #2 \n" // 4 * dx
    "mul v1.4s, v1.4s, v2.4s \n"
    "movi v3.16b, #0x7f \n" // 0x7F
    "movi v4.8h, #0x7f \n" // 0x7F
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add v5.4s, v1.4s, v0.4s \n"
    "1: \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn v2.4h, v5.4s, #9 \n"
    "and v2.8b, v2.8b, v4.8b \n"
    "dup v16.8b, v2.b[0] \n"
    "dup v17.8b, v2.b[2] \n"
    "dup v18.8b, v2.b[4] \n"
    "dup v19.8b, v2.b[6] \n"
    "ext v2.8b, v16.8b, v17.8b, #4 \n"
    "ext v17.8b, v18.8b, v19.8b, #4 \n"
    "ins v2.d[1], v17.d[0] \n" // f
    "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
    "umull v16.8h, v0.8b, v7.8b \n"
    "umull2 v17.8h, v0.16b, v7.16b \n"
    "umull v18.8h, v1.8b, v2.8b \n"
    "umull2 v19.8h, v1.16b, v2.16b \n"
    "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
    "add v16.8h, v16.8h, v18.8h \n"
    "add v17.8h, v17.8h, v19.8h \n"
    "shrn v0.8b, v16.8h, #7 \n"
    "shrn2 v0.16b, v17.8h, #7 \n"
    "st1 {v0.4s}, [%0], #16 \n" // store pixels
    "add v5.4s, v5.4s, v6.4s \n"
    "subs %w2, %w2, #4 \n" // 4 processed per loop
    "b.gt 1b \n"
    : "+r"(dst_argb), // %0
      "+r"(src_argb), // %1
      "+r"(dst_width), // %2
      "+r"(x64), // %3
      "+r"(dx64), // %4
      "+r"(tmp), // %5
      "+r"(src_tmp) // %6
    :
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
      "v6", "v7", "v16", "v17", "v18", "v19"
  );
}

#undef LOAD2_DATA32_LANE

// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
    ptrdiff_t src_stride,
    uint16_t* dst,
    int dst_width) {
  asm volatile(
    // change the stride to row 2 pointer
    "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
    "1: \n"
    "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
    "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
    "subs %w3, %w3, #8 \n" // 8 processed per loop
    "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
    "uaddlp v1.4s, v1.8h \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
    "uadalp v1.4s, v3.8h \n"
    "prfm pldl1keep, [%1, 448] \n"
    "rshrn v0.4h, v0.4s, #2 \n" // round and pack
    "rshrn2 v0.8h, v1.4s, #2 \n"
    "st1 {v0.8h}, [%2], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_width) // %3
    :
    : "v0", "v1", "v2", "v3" // Clobber List
  );
}

// Read 8x2 upsample with filtering and write 16x1.
// Actually reads an extra pixel, so 9x2.
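// This kernel applies the same 9/3/3/1 bilinear weights as the 8-bit
// upsamplers above, but accumulates in 32 bits (constants 9 in v0 and
// 3 in v1) so that 16-bit input cannot overflow.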
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
    ptrdiff_t src_stride,
    uint16_t* dst,
    int dst_width) {
  asm volatile(
    "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
    "movi v0.8h, #9 \n" // constants
    "movi v1.4s, #3 \n"

    "1: \n"
    "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
    "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
    "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
    "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
    "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
    "umull v16.4s, v3.4h, v0.4h \n"
    "umull2 v7.4s, v3.8h, v0.8h \n"
    "umull v18.4s, v4.4h, v0.4h \n"
    "umull2 v17.4s, v4.8h, v0.8h \n"
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "uaddw v16.4s, v16.4s, v6.4h \n"
    "uaddl2 v19.4s, v6.8h, v3.8h \n"
    "uaddl v3.4s, v6.4h, v3.4h \n"
    "uaddw2 v6.4s, v7.4s, v6.8h \n"
    "uaddl2 v7.4s, v5.8h, v4.8h \n"
    "uaddl v4.4s, v5.4h, v4.4h \n"
    "uaddw v18.4s, v18.4s, v5.4h \n"
    "prfm pldl1keep, [%1, 448] \n"
    "mla v16.4s, v4.4s, v1.4s \n"
    "mla v18.4s, v3.4s, v1.4s \n"
    "mla v6.4s, v7.4s, v1.4s \n"
    "uaddw2 v4.4s, v17.4s, v5.8h \n"
    "uqrshrn v16.4h, v16.4s, #4 \n"
    "mla v4.4s, v19.4s, v1.4s \n"
    "uqrshrn2 v16.8h, v6.4s, #4 \n"
    "uqrshrn v17.4h, v18.4s, #4 \n"
    "uqrshrn2 v17.8h, v4.4s, #4 \n"
    "st2 {v16.8h-v17.8h}, [%2], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_width) // %3
    : "r"(2LL), // %4
      "r"(14LL) // %5
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
      "v19" // Clobber List
  );
}

void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v1.8h}, [%1], #16 \n" // store 8 UV
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst), // %1
      "+r"(dst_width) // %2
    :
    : "memory", "cc", "v0", "v1");
}

void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "st1 {v0.8h}, [%1], #16 \n" // store 8 UV
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(dst), // %1
      "+r"(dst_width) // %2
    :
    : "memory", "cc", "v0", "v1");
}

void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst,
    int dst_width) {
  asm volatile(
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
    "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
    "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
    "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
    "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
    "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
    "rshrn v0.8b, v0.8h, #2 \n" // round and pack
    "prfm pldl1keep, [%1, 448] \n"
    "rshrn v1.8b, v1.8h, #2 \n"
    "st2 {v0.8b,v1.8b}, [%2], #16 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_width) // %3
    :
    : "memory", "cc", "v0", "v1", "v16", "v17");
}

// Reads 4 pixels at a time.
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    int src_stepx, // pixel step
    uint8_t* dst_ptr,
    int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
    "1: \n"
    "ld1 {v0.h}[0], [%0], %6 \n"
    "ld1 {v1.h}[0], [%1], %6 \n"
    "ld1 {v2.h}[0], [%2], %6 \n"
    "ld1 {v3.h}[0], [%3], %6 \n"
    "subs %w5, %w5, #4 \n" // 4 pixels per loop.
    "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
    "b.gt 1b \n"
    : "+r"(src_ptr), // %0
      "+r"(src1_ptr), // %1
      "+r"(src2_ptr), // %2
      "+r"(src3_ptr), // %3
      "+r"(dst_ptr), // %4
      "+r"(dst_width) // %5
    : "r"((int64_t)(src_stepx * 8)) // %6
    : "memory", "cc", "v0", "v1", "v2", "v3");
}

#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif