/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"
#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    // load even pixels into v0, odd into v1
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
    "subs %w2, %w2, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1" // Clobber List
  );
}

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
    "subs %w2, %w2, #16 \n" // 16 processed per loop
    "uaddlp v0.8h, v0.16b \n" // add adjacent
    "uaddlp v1.8h, v1.16b \n"
    "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
    "rshrn2 v0.16b, v1.8h, #1 \n"
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1" // Clobber List
  );
}
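
// Rough scalar sketch (illustrative only, not the library's C fallback) of the
// rounded pairwise average computed above:
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = (uint8)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
//   }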

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
    MEMACCESS(1)
    "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
    "subs %w3, %w3, #16 \n" // 16 processed per loop
    "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
    "uaddlp v1.8h, v1.16b \n"
    "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
    "uadalp v1.8h, v3.16b \n"
    "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
    "rshrn2 v0.16b, v1.8h, #2 \n"
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(src_stride), // %1
    "+r"(dst), // %2
    "+r"(dst_width) // %3
  :
  : "v0", "v1", "v2", "v3" // Clobber List
  );
}
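
// Rough scalar sketch (illustrative only) of the 2x2 box filter above:
//   for (int x = 0; x < dst_width; ++x) {
//     const uint8* s = src_ptr + x * 2;
//     const uint8* t = s + src_stride;
//     dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
//   }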

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "st1 {v2.8b}, [%1], #8 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
    MEMACCESS(3)
    "ld1 {v1.16b}, [%2], #16 \n"
    MEMACCESS(4)
    "ld1 {v2.16b}, [%3], #16 \n"
    MEMACCESS(5)
    "ld1 {v3.16b}, [%4], #16 \n"
    "subs %w5, %w5, #4 \n"
    "uaddlp v0.8h, v0.16b \n"
    "uadalp v0.8h, v1.16b \n"
    "uadalp v0.8h, v2.16b \n"
    "uadalp v0.8h, v3.16b \n"
    "addp v0.8h, v0.8h, v0.8h \n"
    "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
    MEMACCESS(1)
    "st1 {v0.s}[0], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(src_ptr1), // %2
    "+r"(src_ptr2), // %3
    "+r"(src_ptr3), // %4
    "+r"(dst_width) // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
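
// Rough scalar sketch (illustrative only) of the 4x4 box filter above, which
// averages 16 source pixels per destination pixel with rounding:
//   for (int x = 0; x < dst_width; ++x) {
//     uint32 sum = 0;
//     for (int dy = 0; dy < 4; ++dy) {
//       for (int dx = 0; dx < 4; ++dx) {
//         sum += src_ptr[dy * src_stride + x * 4 + dx];
//       }
//     }
//     dst_ptr[x] = (uint8)((sum + 8) >> 4);
//   }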

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into a separate register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    "subs %w2, %w2, #24 \n"
    "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
    MEMACCESS(1)
    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
    "subs %w2, %w2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "ushll v16.8h, v4.8b, #0 \n"
    "ushll v17.8h, v5.8b, #0 \n"
    "ushll v18.8h, v6.8b, #0 \n"
    "ushll v19.8h, v7.8b, #0 \n"

    // 3 * line_0 + line_1
    "umlal v16.8h, v0.8b, v20.8b \n"
    "umlal v17.8h, v1.8b, v20.8b \n"
    "umlal v18.8h, v2.8b, v20.8b \n"
    "umlal v19.8h, v3.8b, v20.8b \n"

    // (3 * line_0 + line_1) >> 2
    "uqrshrn v0.8b, v16.8h, #2 \n"
    "uqrshrn v1.8b, v17.8h, #2 \n"
    "uqrshrn v2.8b, v18.8h, #2 \n"
    "uqrshrn v3.8b, v19.8h, #2 \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll v16.8h, v1.8b, #0 \n"
    "umlal v16.8h, v0.8b, v20.8b \n"
    "uqrshrn v0.8b, v16.8h, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd v1.8b, v1.8b, v2.8b \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll v16.8h, v2.8b, #0 \n"
    "umlal v16.8h, v3.8b, v20.8b \n"
    "uqrshrn v2.8b, v16.8h, #2 \n"

    MEMACCESS(1)
    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_stride) // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
    "v20", "memory", "cc"
  );
}
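
// For reference, a scalar sketch (illustrative only) of one group of 4
// blended source pixels p0..p3 being filtered down to 3 output pixels:
//   dst[0] = (uint8)((p0 * 3 + p1 + 2) >> 2);
//   dst[1] = (uint8)((p1 + p2 + 1) >> 1);
//   dst[2] = (uint8)((p2 + p3 * 3 + 2) >> 2);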

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
    "subs %w2, %w2, #24 \n"
    // average src line 0 with src line 1
    "urhadd v0.8b, v0.8b, v4.8b \n"
    "urhadd v1.8b, v1.8b, v5.8b \n"
    "urhadd v2.8b, v2.8b, v6.8b \n"
    "urhadd v3.8b, v3.8b, v7.8b \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll v4.8h, v1.8b, #0 \n"
    "umlal v4.8h, v0.8b, v20.8b \n"
    "uqrshrn v0.8b, v4.8h, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd v1.8b, v1.8b, v2.8b \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll v4.8h, v2.8b, #0 \n"
    "umlal v4.8h, v3.8b, v20.8b \n"
    "uqrshrn v2.8b, v4.8h, #2 \n"

    MEMACCESS(1)
    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_stride) // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
  );
}

static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
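
// sqrdmulh computes roughly (2 * a * b + 32768) >> 16, so multiplying by
// 65536 / 12 approximates division by 6 and 65536 / 18 approximates division
// by 9. Worked example (illustrative only):
//   int sum = 4 + 5 + 6 + 7 + 8 + 9;                    // 39
//   int avg = (2 * sum * (65536 / 12) + 32768) >> 16;   // 6, roughly 39 / 6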

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v3.16b}, [%3] \n"
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
    "subs %w2, %w2, #12 \n"
    "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
    MEMACCESS(1)
    "st1 {v2.8b}, [%1], #8 \n"
    MEMACCESS(1)
    "st1 {v2.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(&kShuf38) // %3
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile (
    MEMACCESS(5)
    "ld1 {v29.8h}, [%5] \n"
    MEMACCESS(6)
    "ld1 {v30.16b}, [%6] \n"
    MEMACCESS(7)
    "ld1 {v31.8h}, [%7] \n"
    "add %2, %2, %0 \n"
    "1: \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
    MEMACCESS(4)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
    "subs %w4, %w4, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v20.8b, v0.8b, v1.8b \n"
    "trn2 v21.8b, v0.8b, v1.8b \n"
    "trn1 v22.8b, v4.8b, v5.8b \n"
    "trn2 v23.8b, v4.8b, v5.8b \n"
    "trn1 v24.8b, v16.8b, v17.8b \n"
    "trn2 v25.8b, v16.8b, v17.8b \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1 v0.8b, v2.8b, v3.8b \n"
    "trn2 v1.8b, v2.8b, v3.8b \n"
    "trn1 v4.8b, v6.8b, v7.8b \n"
    "trn2 v5.8b, v6.8b, v7.8b \n"
    "trn1 v16.8b, v18.8b, v19.8b \n"
    "trn2 v17.8b, v18.8b, v19.8b \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp v20.4h, v20.8b \n"
    "uaddlp v21.4h, v21.8b \n"
    "uaddlp v22.4h, v22.8b \n"
    "uaddlp v23.4h, v23.8b \n"
    "uaddlp v24.4h, v24.8b \n"
    "uaddlp v25.4h, v25.8b \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp v1.4h, v1.8b \n"
    "uaddlp v5.4h, v5.8b \n"
    "uaddlp v17.4h, v17.8b \n"

    // combine source lines
    "add v20.4h, v20.4h, v22.4h \n"
    "add v21.4h, v21.4h, v23.4h \n"
    "add v20.4h, v20.4h, v24.4h \n"
    "add v21.4h, v21.4h, v25.4h \n"
    "add v2.4h, v1.4h, v5.4h \n"
    "add v2.4h, v2.4h, v17.4h \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "sqrdmulh v2.8h, v2.8h, v29.8h \n"
    "xtn v2.8b, v2.8h \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "ushll v16.8h, v16.8b, #0 \n"
    "uaddl v0.8h, v0.8b, v4.8b \n"

    // combine source lines
    "add v0.8h, v0.8h, v16.8h \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1 v1.8h, v0.8h, v0.8h \n"
    "trn2 v4.8h, v0.8h, v0.8h \n"
    "xtn v0.4h, v1.4s \n"
    "xtn v4.4h, v4.4s \n"

    // 0+1+2, 3+4+5
    "add v20.8h, v20.8h, v0.8h \n"
    "add v21.8h, v21.8h, v4.8h \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "sqrdmulh v0.8h, v20.8h, v31.8h \n"
    "sqrdmulh v1.8h, v21.8h, v31.8h \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

    MEMACCESS(1)
    "st1 {v3.8b}, [%1], #8 \n"
    MEMACCESS(1)
    "st1 {v3.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(tmp_src_stride), // %2
    "+r"(src_ptr1), // %3
    "+r"(dst_width) // %4
  : "r"(&kMult38_Div6), // %5
    "r"(&kShuf38_2), // %6
    "r"(&kMult38_Div9) // %7
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
    "v30", "v31", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(4)
    "ld1 {v30.8h}, [%4] \n"
    MEMACCESS(5)
    "ld1 {v31.16b}, [%5] \n"
    "add %2, %2, %0 \n"
    "1: \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
    "subs %w3, %w3, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v16.8b, v0.8b, v1.8b \n"
    "trn2 v17.8b, v0.8b, v1.8b \n"
    "trn1 v18.8b, v4.8b, v5.8b \n"
    "trn2 v19.8b, v4.8b, v5.8b \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1 v0.8b, v2.8b, v3.8b \n"
    "trn2 v1.8b, v2.8b, v3.8b \n"
    "trn1 v4.8b, v6.8b, v7.8b \n"
    "trn2 v5.8b, v6.8b, v7.8b \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp v16.4h, v16.8b \n"
    "uaddlp v17.4h, v17.8b \n"
    "uaddlp v18.4h, v18.8b \n"
    "uaddlp v19.4h, v19.8b \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp v1.4h, v1.8b \n"
    "uaddlp v5.4h, v5.8b \n"

    // combine source lines
    "add v16.4h, v16.4h, v18.4h \n"
    "add v17.4h, v17.4h, v19.4h \n"
    "add v2.4h, v1.4h, v5.4h \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "uqrshrn v2.8b, v2.8h, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines
    "uaddl v0.8h, v0.8b, v4.8b \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1 v1.8h, v0.8h, v0.8h \n"
    "trn2 v4.8h, v0.8h, v0.8h \n"
    "xtn v0.4h, v1.4s \n"
    "xtn v4.4h, v4.4s \n"

    // 0+1+2, 3+4+5
    "add v16.8h, v16.8h, v0.8h \n"
    "add v17.8h, v17.8h, v4.8h \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "sqrdmulh v0.8h, v16.8h, v30.8h \n"
    "sqrdmulh v1.8h, v17.8h, v30.8h \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent

    "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    MEMACCESS(1)
    "st1 {v3.8b}, [%1], #8 \n"
    MEMACCESS(1)
    "st1 {v3.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(tmp_src_stride), // %2
    "+r"(dst_width) // %3
  : "r"(&kMult38_Div6), // %4
    "r"(&kShuf38_2) // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}
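
// For reference (sketch only): most output pixels above are a 3x2 box of
// source pixels divided by 6, e.g.
//   dst[0] = (uint8)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) / 6);
// where s points into row 0 and t into row 1; every third output pixel comes
// from a 2x2 box divided by 4.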

void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp = NULL;
  asm volatile (
    "1: \n"
    "mov %0, %1 \n"
    "mov w12, %w5 \n"
    "eor v2.16b, v2.16b, v2.16b \n"
    "eor v3.16b, v3.16b, v3.16b \n"
    "2: \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], %3 \n"
    "uaddw2 v3.8h, v3.8h, v0.16b \n"
    "uaddw v2.8h, v2.8h, v0.8b \n"
    "subs w12, w12, #1 \n"
    "b.gt 2b \n"
    MEMACCESS(2)
    "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
    "add %1, %1, #16 \n"
    "subs %w4, %w4, #16 \n" // 16 processed per loop
    "b.gt 1b \n"
  : "+r"(src_tmp), // %0
    "+r"(src_ptr), // %1
    "+r"(dst_ptr), // %2
    "+r"(src_stride), // %3
    "+r"(src_width), // %4
    "+r"(src_height) // %5
  :
  : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
  );
}
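
// Rough scalar sketch (illustrative only): each destination entry is the
// uint16 column sum of src_height source rows.
//   for (int x = 0; x < src_width; ++x) {
//     uint16 sum = 0;
//     for (int y = 0; y < src_height; ++y) {
//       sum += src_ptr[y * src_stride + x];
//     }
//     dst_ptr[x] = sum;
//   }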

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)            \
    "lsr %5, %3, #16 \n"               \
    "add %6, %1, %5 \n"                \
    "add %3, %3, %4 \n"                \
    MEMACCESS(6)                       \
    "ld2 {v4.b, v5.b}["#n"], [%6] \n"

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup v0.4s, %w3 \n" // x
    "dup v1.4s, %w4 \n" // dx
    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
    "shl v3.4s, v1.4s, #2 \n" // 4 * dx
    "mul v1.4s, v1.4s, v2.4s \n"
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add v1.4s, v1.4s, v0.4s \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add v2.4s, v1.4s, v3.4s \n"
    "shl v0.4s, v3.4s, #1 \n" // 8 * dx
    "1: \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov v6.16b, v1.16b \n"
    "mov v7.16b, v2.16b \n"
    "uzp1 v6.8h, v6.8h, v7.8h \n"
    "ushll v4.8h, v4.8b, #0 \n"
    "ushll v5.8h, v5.8b, #0 \n"
    "ssubl v16.4s, v5.4h, v4.4h \n"
    "ssubl2 v17.4s, v5.8h, v4.8h \n"
    "ushll v7.4s, v6.4h, #0 \n"
    "ushll2 v6.4s, v6.8h, #0 \n"
    "mul v16.4s, v16.4s, v7.4s \n"
    "mul v17.4s, v17.4s, v6.4s \n"
    "shrn v6.4h, v16.4s, #16 \n"
    "shrn2 v6.8h, v17.4s, #16 \n"
    "add v4.8h, v4.8h, v6.8h \n"
    "xtn v4.8b, v4.8h \n"

    MEMACCESS(0)
    "st1 {v4.8b}, [%0], #8 \n" // store pixels
    "add v1.4s, v1.4s, v0.4s \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "b.gt 1b \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+r"(dst_width64), // %2
    "+r"(x64), // %3
    "+r"(dx64), // %4
    "+r"(tmp), // %5
    "+r"(src_tmp) // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
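
// The x/dx values above are 16.16 fixed point. A scalar sketch (illustrative
// only) of the per-pixel filtering:
//   for (int i = 0; i < dst_width; ++i) {
//     int xi = x >> 16;
//     int a = src_ptr[xi];
//     int b = src_ptr[xi + 1];
//     dst_ptr[i] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
//     x += dx;
//   }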

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile (
    "cmp %w4, #0 \n"
    "b.eq 100f \n"
    "add %2, %2, %1 \n"
    "cmp %w4, #64 \n"
    "b.eq 75f \n"
    "cmp %w4, #128 \n"
    "b.eq 50f \n"
    "cmp %w4, #192 \n"
    "b.eq 25f \n"

671 "dup v5.8b, %w4 \n"
672 "dup v4.8b, %w5 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "umull v6.8h, v0.8b, v4.8b \n"
    "umull2 v7.8h, v0.16b, v4.16b \n"
    "umlal v6.8h, v1.8b, v5.8b \n"
    "umlal2 v7.8h, v1.16b, v5.16b \n"
    "rshrn v0.8b, v6.8h, #8 \n"
    "rshrn2 v0.16b, v7.8h, #8 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v0.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    "subs %w3, %w3, #16 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "st1 {v0.b}[15], [%0] \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+r"(src_stride), // %2
    "+r"(dst_width), // %3
    "+r"(source_y_fraction), // %4
    "+r"(y_fraction) // %5
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
  );
}
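
// The general blend case above in scalar form (sketch only):
//   int f = source_y_fraction;  // 0..256, special cased at 0, 64, 128, 192.
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (uint8)
//         ((src_ptr[x] * (256 - f) + src_ptr[x + src_stride] * f + 128) >> 8);
//   }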

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
    MEMACCESS(0)
    "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
    MEMACCESS(1)
    "st1 {v3.16b}, [%1], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst), // %1
    "+r"(dst_width) // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    // load 8 ARGB pixels.
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
    "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
    "rshrn v1.8b, v1.8h, #1 \n"
    "rshrn v2.8b, v2.8h, #1 \n"
    "rshrn v3.8b, v3.8h, #1 \n"
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(dst_width) // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
    "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
    "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
    "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
    "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
    "rshrn v1.8b, v1.8h, #2 \n"
    "rshrn v2.8b, v2.8h, #2 \n"
    "rshrn v3.8b, v3.8h, #2 \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(src_stride), // %1
    "+r"(dst), // %2
    "+r"(dst_width) // %3
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.s}[0], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[1], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[2], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[3], [%0], %3 \n"
    "subs %w2, %w2, #4 \n" // 4 pixels per loop.
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(dst_width) // %2
  : "r"((int64)(src_stepx * 4)) // %3
  : "memory", "cc", "v0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v4.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v5.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v6.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v7.8b}, [%1], %4 \n"
    "uaddl v0.8h, v0.8b, v1.8b \n"
    "uaddl v2.8h, v2.8b, v3.8b \n"
    "uaddl v4.8h, v4.8b, v5.8b \n"
    "uaddl v6.8h, v6.8b, v7.8b \n"
    "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
    "mov v0.d[1], v2.d[0] \n"
    "mov v2.d[0], v16.d[1] \n"
    "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
    "mov v4.d[1], v6.d[0] \n"
    "mov v6.d[0], v16.d[1] \n"
    "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
    "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
    "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
    "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
    "subs %w3, %w3, #4 \n" // 4 pixels per loop.
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stride), // %1
    "+r"(dst_argb), // %2
    "+r"(dst_width) // %3
  : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n)       \
    "lsr %5, %3, #16 \n"               \
    "add %6, %1, %5, lsl #2 \n"        \
    "add %3, %3, %4 \n"                \
    MEMACCESS(6)                       \
    "ld1 {"#vn".s}["#n"], [%6] \n"

void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  int64 tmp64 = 0;
  asm volatile (
    "1: \n"
    LOAD1_DATA32_LANE(v0, 0)
    LOAD1_DATA32_LANE(v0, 1)
    LOAD1_DATA32_LANE(v0, 2)
    LOAD1_DATA32_LANE(v0, 3)
    LOAD1_DATA32_LANE(v1, 0)
    LOAD1_DATA32_LANE(v1, 1)
    LOAD1_DATA32_LANE(v1, 2)
    LOAD1_DATA32_LANE(v1, 3)

    MEMACCESS(0)
    "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop
    "b.gt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(src_argb), // %1
    "+r"(dst_width64), // %2
    "+r"(x64), // %3
    "+r"(dx64), // %4
    "+r"(tmp64), // %5
    "+r"(src_tmp) // %6
  :
  : "memory", "cc", "v0", "v1"
  );
}

#undef LOAD1_DATA32_LANE
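
// Scalar sketch (illustrative only) of the nearest-neighbor ARGB column step,
// with x and dx in 16.16 fixed point:
//   const uint32* src = (const uint32*)(src_argb);
//   uint32* dst = (uint32*)(dst_argb);
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[x >> 16];
//     x += dx;
//   }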

// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n)        \
    "lsr %5, %3, #16 \n"                      \
    "add %6, %1, %5, lsl #2 \n"               \
    "add %3, %3, %4 \n"                       \
    MEMACCESS(6)                              \
    "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup v0.4s, %w3 \n" // x
    "dup v1.4s, %w4 \n" // dx
    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
    "shl v6.4s, v1.4s, #2 \n" // 4 * dx
    "mul v1.4s, v1.4s, v2.4s \n"
    "movi v3.16b, #0x7f \n" // 0x7F
    "movi v4.8h, #0x7f \n" // 0x7F
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add v5.4s, v1.4s, v0.4s \n"
    "1: \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn v2.4h, v5.4s, #9 \n"
    "and v2.8b, v2.8b, v4.8b \n"
    "dup v16.8b, v2.b[0] \n"
    "dup v17.8b, v2.b[2] \n"
    "dup v18.8b, v2.b[4] \n"
    "dup v19.8b, v2.b[6] \n"
    "ext v2.8b, v16.8b, v17.8b, #4 \n"
    "ext v17.8b, v18.8b, v19.8b, #4 \n"
    "ins v2.d[1], v17.d[0] \n" // f
    "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
    "umull v16.8h, v0.8b, v7.8b \n"
    "umull2 v17.8h, v0.16b, v7.16b \n"
    "umull v18.8h, v1.8b, v2.8b \n"
    "umull2 v19.8h, v1.16b, v2.16b \n"
    "add v16.8h, v16.8h, v18.8h \n"
    "add v17.8h, v17.8h, v19.8h \n"
    "shrn v0.8b, v16.8h, #7 \n"
    "shrn2 v0.16b, v17.8h, #7 \n"

    MEMACCESS(0)
    "st1 {v0.4s}, [%0], #16 \n" // store pixels
    "add v5.4s, v5.4s, v6.4s \n"
    "subs %w2, %w2, #4 \n" // 4 processed per loop
    "b.gt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(src_argb), // %1
    "+r"(dst_width64), // %2
    "+r"(x64), // %3
    "+r"(dx64), // %4
    "+r"(tmp), // %5
    "+r"(src_tmp) // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}

#undef LOAD2_DATA32_LANE
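
// Scalar sketch (illustrative only) of the 7-bit blend used above for each
// ARGB channel, where a and b are horizontally adjacent source samples:
//   int f = (x >> 9) & 0x7f;  // top 7 bits of the 16.16 fraction
//   dst = (uint8)((a * (127 - f) + b * f) >> 7);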

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif